
##### SET UP #####

### load libraries
library(foreign); library(WDI); library(stringr); library(emdist)
library(plyr); library(FactoMineR); library(repmis); library(haven)
library(reshape2); library(MASS); library(ggplot2); library(grid)
library(gridExtra); library(arm); library(readxl); library(readstata13)

### set up paths
# Placeholder: replace with the directory that holds the raw survey folders
# (./ATES, ./BCS1992, ...) and ./cleaned-data; every file path used below is
# relative to this working directory.
setwd("/path/to/your/data")

### set random seed
# Fix the RNG state so that any stochastic step run after this point is
# reproducible.
set.seed(2365256)

### NB to the user: this script requires an internet connection, to use WDI() to 
### get an empty country-year panel. It might also be helpful to ensure that you
### follow the instructions for installing emdist available in the replication 
### archive for Lupu, Selios, and Warner (2017).

##### DEFINE UTILITIES #####

### function for making dyads
# Cross-joins any number of data frames: merge(..., by = NULL) pairs every row
# of its left input with every row of its right input, and Reduce() folds that
# pairwise join over all arguments from left to right.
expand.grid.df <- function(...) {
  cross.join <- function(...) merge(..., by = NULL)
  Reduce(cross.join, list(...))
}

### function for effective subsetting
# Returns the rows of `data` that have no missing value in any of the columns
# selected by `desiredCols`; every column of `data` is kept.
completeFun <- function(data, desiredCols) {
  keep.rows <- complete.cases(data[, desiredCols])
  data[keep.rows, ]
}

### function for rescaling question responses from various intervals (e.g.,
### 0-10) to uniform ones (e.g., [-1,1])
# x:                    numeric responses on the original scale
# theoryMin, theoryMax: end points of the original scale
# newMin, newMax:       end points of the target scale
# Linear map sending theoryMin to newMin and theoryMax to newMax; NA stays NA.
rescalr <- function(x, theoryMin, theoryMax, newMin, newMax){
  share <- (x - theoryMin)/(theoryMax - theoryMin)
  share*(newMax - newMin) + newMin
}

### function for making quantiles
# Bins x into n quantile groups and returns a factor labelled 0, ..., n - 1.
# Break points are the empirical quantiles of x (missing values ignored); the
# lowest observation is included in the first bin.
qcut <- function(x, n) {
  probs <- seq(0, 1, length.out = n + 1)
  breaks <- quantile(x, probs, na.rm = TRUE)
  cut(x, breaks, labels = seq_len(n) - 1, include.lowest = TRUE)
}

### function to help normalize country names
# Upper-cases the first letter of every space-separated word in a single
# string and leaves the remaining letters as they are. Only the first element
# of x is processed.
simpleCap <- function(x) {
  words <- strsplit(x, " ")[[1]]
  initials <- toupper(substring(words, 1, 1))
  remainder <- substring(words, 2)
  paste(initials, remainder, sep = "", collapse = " ")
}

##### SETTING UP THE FINAL DATASET #####
### get a base country-year dataset from WDI
# WDI indicator code -> variable name used in the rest of the script
wdi.vars <- c("NY.GDP.PCAP.KD"    = "gdp",
              "SI.POV.GINI"       = "gini",
              "NE.TRD.GNFS.ZS"    = "trade_pc_gdp",
              "BX.KLT.DINV.CD.WD" = "fdi_net",
              "DT.ODA.ALLD.GD.ZS" = "oda_pc_gdp",
              "DT.INT.DECT.GN.ZS" = "ext_debt_interest",
              "GFDD.OI.13"        = "remit_pc_gdp",
              "SI.POV.NAHC"       = "pov_pc_pop",
              "SI.POV.NAGP"       = "pov_rate")
df.final <- WDI(country = "all", start = 1960, end = 2015, extra = TRUE,
                indicator = names(wdi.vars))
# drop the "Aggregates" pseudo-region and rows with no region at all
df.final <- df.final[!is.na(df.final$region) &
                       df.final$region != "Aggregates",]
# fix names
# Rename by indicator code instead of by column position (previously 4:12), so
# the labels stay correct whatever column order WDI() returns, and a missing
# indicator stops the script instead of shifting every later name.
wdi.pos <- match(names(wdi.vars), names(df.final))
if (anyNA(wdi.pos)) {
  stop("WDI() did not return indicator(s): ",
       paste(names(wdi.vars)[is.na(wdi.pos)], collapse = ", "))
}
names(df.final)[wdi.pos] <- unname(wdi.vars)
rm(wdi.vars, wdi.pos)
# log gdp
df.final$gdp_log <- log(df.final$gdp)
# express fdi as proportion of gdp
# NOTE(review): gdp comes from NY.GDP.PCAP.KD and fdi_net from
# BX.KLT.DINV.CD.WD -- confirm that this ratio is the intended measure.
df.final$fdi_net_gdp <- df.final$fdi_net/df.final$gdp
# log fdi_net_gdp (log() returns NaN with a warning for negative ratios and
# -Inf for zero)
df.final$fdi_net_gdp_log <- log(df.final$fdi_net_gdp)
# keep vars we need
df.final <- df.final[,c("country","year","iso2c","iso3c","gdp","gdp_log","gini",
                        "trade_pc_gdp","fdi_net","fdi_net_gdp","fdi_net_gdp_log",
                        "oda_pc_gdp","ext_debt_interest","remit_pc_gdp",
                        "pov_pc_pop","pov_rate")]

### make a few adjustments
# Germany: rows before 1991 are relabelled "Germany West" and then copied under
# the name "Germany East" (the copy carries the same values)
df.final$country[which(df.final$country == "Germany" &
                         df.final$year < 1991)] <- "Germany West"
tmp <- df.final[df.final$country == "Germany West",]
tmp$country <- rep("Germany East",nrow(tmp))
df.final <- rbind(df.final,tmp)
# handle Serbia and Montenegro: Serbia 1990-2005 is relabelled, and Serbia's
# rows after 2005 are copied under the name "Montenegro"
df.final$country[which(df.final$country == "Serbia" & df.final$year >= 1990 &
                         df.final$year < 2006)] <- "Serbia and Montenegro"
tmp <- df.final[which(df.final$country == "Serbia" & df.final$year > 2005),]
tmp$country <- rep("Montenegro",nrow(tmp))
df.final <- rbind(df.final,tmp)
# add Taiwan as a copy of China's rows (only the country name is changed)
tmp <- df.final[which(df.final$country == "China"),]
tmp$country <- rep("Taiwan",nrow(tmp))
df.final <- rbind(df.final,tmp)
df.final <- df.final[order(df.final$country,df.final$year),]
# empty placeholder columns, all NA at this point
df.final$emd_all <- df.final$emd_moreaffluent <- df.final$emd_lessaffluent <-
  rep(NA,nrow(df.final))
# some metadata (also empty placeholders)
df.final$esurvey <- df.final$msurvey <- df.final$nobs_mass_poor <-
  df.final$nobs_mass_rich <- df.final$nobs_mass_all <- df.final$nobs_elite <-
  rep(NA,nrow(df.final))

##### BEGIN DATA CLEANING #####

##### ELITES #####
### 1.-5. ATES
# Read every .dta file below ./ATES. The list positions used below (1-5)
# follow the order in which list.files() returns the paths.
ates.paths <- paste0("./ATES/", list.files(path = "./ATES",
                                           pattern = "\\.dta$",
                                           recursive = TRUE))
ates.raw <- lapply(ates.paths,
                   function(p) read.dta(p, convert.factors = FALSE))

# file 1: only outcome and ideology are read; sex and party are filled with NA
wave <- ates.raw[[1]][, c("won", "ideology")]
wave$sex <- rep(NA, nrow(wave))
wave$party <- rep(NA, nrow(wave))
wave$survey <- rep("ATES CAN 2003", nrow(wave))
ates.raw[[1]] <- wave[which(wave$won < 4), ]   # 4 is not elected

# file 2: party is blanked (no codebook)
wave <- ates.raw[[2]][, c("won", "ideology", "party", "sex")]
wave$party <- rep(NA, nrow(wave))
wave$survey <- rep("ATES HOC 2004", nrow(wave))
# not sure what 3 means here, but 121 were elected, so...
ates.raw[[2]] <- wave[which(wave$won == 1), ]

# file 3: party is blanked (no codebook)
wave <- ates.raw[[3]][, c("won", "ideology", "partycod", "sex")]
wave$partycod <- rep(NA, nrow(wave))
wave$survey <- rep("ATES CAN 2005", nrow(wave))
ates.raw[[3]] <- wave[which(wave$won < 4), ]   # 4 is not elected

# file 4
wave <- ates.raw[[4]][, c("result", "q7", "party", "sex")]
wave$survey <- rep("ATES HOR 2012", nrow(wave))
ates.raw[[4]] <- wave[which(wave$result > 0), ]   # 0 is not elected

# file 5
wave <- ates.raw[[5]][, c("result", "q12", "party", "sex")]
wave$survey <- rep("ATES HOC 2013", nrow(wave))
# 0 is not elected; this includes those not up for election
ates.raw[[5]] <- wave[which(wave$result > 0), ]

# give every wave the same five column names (assigned by position) and store
# everything as character so the waves can be stacked
ates.cols <- c("result", "ideology", "party", "sex", "survey")
ates.raw <- lapply(ates.raw, function(d) {
  colnames(d) <- ates.cols
  for (v in ates.cols) d[[v]] <- as.character(d[[v]])
  d
})
ates <- do.call(rbind, ates.raw)
# the election-outcome column is no longer needed; reuse it for the country
ates$result <- rep("Japan", nrow(ates))
colnames(ates)[1] <- "country"
# clean ideology
# codes 66 and 99 are set to missing
ates$ideology[ates$ideology %in% c("66", "99")] <- NA
ates$ideology <- as.numeric(ates$ideology)
# the 2003, 2004 and 2005 waves are shifted down by one before rescaling
early.wave <- ates$survey %in% c("ATES CAN 2003", "ATES HOC 2004",
                                 "ATES CAN 2005")
ates$ideology[early.wave] <- ates$ideology[early.wave] - 1
ates$ideology_scaled <- rescalr(ates$ideology, 0, 10, -1, 1)
ates$ideology_scale_orig <- rep("0-10", nrow(ates))
# clean sex
ates$sex[ates$sex %in% "1"] <- "Male"
ates$sex[ates$sex %in% "2"] <- "Female"
# clean party ID
# Numeric party codes are survey-specific, so each survey has its own lookup
# table (code -> party name). Codes with no entry are left unchanged.
# Fix: code 13 of ATES HOC 2013 ("Greens Japan") used to be tested against the
# survey name "ATES HOR 2013", which matches no rows, so it was never recoded.
ates.party.labels <- list(
  "ATES HOR 2012" = c("1"  = "Democratic Party of Japan",
                      "2"  = "Liberal Democratic Party",
                      "3"  = "Tomorrow Party of Japan",
                      "4"  = "Komeito",
                      "5"  = "Japan Restoration Party",
                      "6"  = "Communist Party of Japan",
                      "7"  = "Your Party",
                      "8"  = "Social Democratic Party",
                      "9"  = "New Party Daichi",
                      "10" = "People's New Party",
                      "11" = "New Party Nippon",
                      "12" = "New Renaissance Party",
                      "13" = "Other",
                      "14" = "Independent"),
  "ATES HOC 2013" = c("1"  = "Liberal Democratic Party",
                      "2"  = "Democratic Party of Japan",
                      "3"  = "Japan Restoration Party",
                      "4"  = "Komeito",
                      "5"  = "Your Party",
                      "6"  = "Communist Party of Japan",
                      "7"  = "People's Life Party",
                      "8"  = "Social Democratic Party",
                      "9"  = "Green Wind",
                      "10" = "New Party Daichi",
                      "11" = "New Renaissance Party",
                      "12" = "Happiness Realization Party",
                      "13" = "Greens Japan",
                      "14" = "Restoration Party",
                      "15" = "Genzei Nippon",
                      "16" = "Other",
                      "17" = "Independent")
)
for (svy in names(ates.party.labels)) {
  labs <- ates.party.labels[[svy]]
  # rows of this survey whose party is one of the known codes
  hit <- which(ates$survey == svy & ates$party %in% names(labs))
  ates$party[hit] <- unname(labs[ates$party[hit]])
}
rm(ates.party.labels, svy, labs, hit)
# add legislative terms
# First and last year of the term attached to each survey, looked up by survey
# name; a survey without an entry gets NA.
term.begin <- c("ATES CAN 2003" = "2003", "ATES HOC 2004" = "2004",
                "ATES CAN 2005" = "2005", "ATES HOR 2012" = "2012",
                "ATES HOC 2013" = "2013")
term.end <- c("ATES CAN 2003" = "2005", "ATES HOC 2004" = "2007",
              "ATES CAN 2005" = "2009", "ATES HOR 2012" = "2014",
              "ATES HOC 2013" = "2016")
ates$leg.end <- unname(term.end[ates$survey])
ates$leg.begin <- unname(term.begin[ates$survey])
# create panel
# One output row per respondent per year of the legislative term (leg.begin
# through leg.end, inclusive). The expansion is a single vectorised step:
# rep() repeats each row index once per term year and sequence() numbers the
# years within each term. This replaces a row-by-row rbind() loop, which
# copied the growing data frame on every iteration and broke on zero rows
# because 1:nrow(x) counts down from 1 to 0.
term.start <- as.integer(ates$leg.begin)
term.years <- as.integer(ates$leg.end) - term.start + 1L
row.id <- rep(seq_len(nrow(ates)), times = term.years)
ates.full <- ates[row.id, ]
ates.full$year <- term.start[row.id] + sequence(term.years) - 1L
ates <- ates.full
ates <- ates[order(ates$country,ates$year),
             c("country","year","survey","ideology","ideology_scaled",
               "ideology_scale_orig","party","sex")]
# store for later
write.csv(ates, "./cleaned-data/ates.csv",row.names = F)
# clear the workspace except for the shared helpers and df.final
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 6. BCS 1992
bcs <- read.dta("./BCS1992/BCS1992.dta", convert.factors = FALSE)
bcs <- setNames(bcs[, c("g2", "g39", "g1", "g56")],
                c("mp", "ideology", "party", "sex"))
bcs$country <- rep("United Kingdom", nrow(bcs))
# keep only rows whose mp flag is 1
bcs$mp <- as.character(bcs$mp)
bcs <- bcs[bcs$mp %in% "1", ]
# ideology: code 9 is set to missing
bcs$ideology <- as.character(bcs$ideology)
bcs$ideology[bcs$ideology %in% "9"] <- NA
# clean sex
bcs$sex <- as.character(bcs$sex)
bcs$sex[bcs$sex %in% "1"] <- "Male"
bcs$sex[bcs$sex %in% "2"] <- "Female"
# clean party: codes with no entry in the table are left as they are
bcs$party <- as.character(bcs$party)
party.labels <- c("1" = "Conservative",
                  "2" = "Labour",
                  "3" = "Liberal Democrat",
                  "4" = "Scottish National Party",
                  "5" = "Plaid Cymru")
coded <- bcs$party %in% names(party.labels)
bcs$party[coded] <- unname(party.labels[bcs$party[coded]])
# tidy: responses are rescaled from 1-7 to [-1,1]
bcs$ideology <- as.numeric(bcs$ideology)
bcs$ideology_scaled <- rescalr(bcs$ideology, 1, 7, -1, 1)
bcs$ideology_scale_orig <- rep("1-7", nrow(bcs))
bcs$survey <- rep("BCS 1992", nrow(bcs))
# create panel
# Every respondent is assigned the 1987-1992 term and repeated once per year
# of it (inclusive). rep() repeats each row index once per term year and
# sequence() numbers the years within each term; this replaces a row-by-row
# rbind() loop, which copied the growing data frame on every iteration and
# broke on zero rows because 1:nrow(x) counts down from 1 to 0.
bcs$leg.begin <- rep("1987", nrow(bcs)); bcs$leg.end <- rep("1992", nrow(bcs))
term.start <- as.integer(bcs$leg.begin)
term.years <- as.integer(bcs$leg.end) - term.start + 1L
row.id <- rep(seq_len(nrow(bcs)), times = term.years)
bcs.full <- bcs[row.id, ]
bcs.full$year <- term.start[row.id] + sequence(term.years) - 1L
bcs <- bcs.full
bcs <- bcs[order(bcs$country,bcs$year),
           c("country","year","survey","ideology","ideology_scaled",
             "ideology_scale_orig","party","sex")]
# store for later
write.csv(bcs, "./cleaned-data/bcs-1992.csv",row.names = F)
# clear the workspace except for the shared helpers and df.final
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

#### 7. Brazilian Legislator Surveys (Power and Zucco 2011)
pz11 <- as.data.frame(
  as_factor(read_dta("./Power Zucco 2011/bls7_released_v01.dta"))
)
pz11$country <- rep("Brazil", nrow(pz11))
pz11 <- pz11[, c("country", "yearcase", "lrclass", "party_elected")]
colnames(pz11) <- c("country", "wave", "ideology", "party")  # czideo is scaled
pz11$sex <- rep(NA, nrow(pz11))
# ignore NA warning here
pz11$ideology <- as.numeric(as.character(pz11$ideology))
pz11$party <- as.character(pz11$party)
pz11$party[pz11$party %in% "NA"] <- NA
# the wave is the leading four-digit year of the yearcase label
pz11$wave <- str_extract(as.character(pz11$wave), "^\\d{4}")
# code legislative sessions -- this is annoying because it's not structured
# around elections, probably lots of overlap
# wave -> first / last year of the legislative term; other waves get NA
term.begin <- c("1990" = "1990", "1993" = "1990", "1997" = "1994",
                "2001" = "1998", "2005" = "2002", "2009" = "2006",
                "2013" = "2010")
term.end <- c("1990" = "1994", "1993" = "1994", "1997" = "1998",
              "2001" = "2002", "2005" = "2006", "2009" = "2010",
              "2013" = "2014")
pz11$leg.begin <- unname(term.begin[pz11$wave])
pz11$leg.end <- unname(term.end[pz11$wave])
pz11$wave <- NULL
# make time series
# One output row per respondent per year of the legislative term (leg.begin
# through leg.end, inclusive), built in one vectorised step instead of a
# row-by-row rbind() loop (which copied the growing data frame on every
# iteration and broke on zero rows because 1:nrow(x) counts down from 1 to 0).
term.start <- as.integer(pz11$leg.begin)
term.years <- as.integer(pz11$leg.end) - term.start + 1L
row.id <- rep(seq_len(nrow(pz11)), times = term.years)
pz11 <- pz11[row.id, c("country","ideology","party","sex","leg.begin",
                       "leg.end")]
pz11$year <- term.start[row.id] + sequence(term.years) - 1L
### resolve some duplicate sampling - could do this above by subtracting 1 off
### of the leg.end, but we want to be clear about legislative terms
# The last year of each of these terms is also the first year of the next one,
# so it is dropped from the earlier term. Rows are removed with a logical mask
# rather than pz11[-which(cond), ]: when which() finds nothing, a negative
# empty index selects zero rows and would silently delete the whole data set.
shared.year <- c("1990" = 1994L, "1994" = 1998L, "1998" = 2002L,
                 "2002" = 2006L, "2006" = 2010L)
overlap <- pz11$year == shared.year[pz11$leg.begin]
pz11 <- pz11[!(overlap %in% TRUE),]
# fix ideology: code 99 is set to missing
pz11$ideology[pz11$ideology %in% 99] <- NA
# cleaning up
pz11$ideology_scaled <- rescalr(pz11$ideology, 1, 10, -1, 1)
pz11$ideology_scale_orig <- rep("1-10",nrow(pz11))
pz11$survey <- rep("Brazilian Legislator Survey v1",nrow(pz11))
pz11 <- pz11[order(pz11$country,pz11$year),
             c("country","year","survey","ideology","ideology_scaled",
               "ideology_scale_orig","party","sex")]
# store for later
write.csv(pz11, "./cleaned-data/bls-v1.csv",row.names = F)
# clear the workspace except for the shared helpers and df.final
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 8. BRS 1997
brs <- read.dta("./BRS1997/BRS1997.dta", convert.factors = FALSE)
brs <- setNames(brs[, c("mp_92", "q23a", "q1", "q38b")],
                c("mp", "ideology", "party", "sex"))
brs$country <- rep("United Kingdom", nrow(brs))
# keep only rows whose mp flag is 1
brs$mp <- as.character(brs$mp)
brs$ideology <- as.character(brs$ideology)
brs <- brs[brs$mp %in% "1", ]
# ideology: code 99 is set to missing
brs$ideology[brs$ideology %in% "99"] <- NA
# clean party: codes with no entry in the table are left as they are
brs$party <- as.character(brs$party)
party.labels <- c("1" = "Conservative",
                  "2" = "Labour",
                  "3" = "Liberal Democrat",
                  "4" = "Scottish National Party")
coded <- brs$party %in% names(party.labels)
brs$party[coded] <- unname(party.labels[brs$party[coded]])
# clean sex: code 9 is set to missing
brs$sex <- as.character(brs$sex)
brs$sex[brs$sex %in% "1"] <- "Male"
brs$sex[brs$sex %in% "2"] <- "Female"
brs$sex[brs$sex %in% "9"] <- NA
# tidy: responses are rescaled from 0-10 to [-1,1]
brs$ideology <- as.numeric(brs$ideology)
brs$ideology_scaled <- rescalr(brs$ideology, 0, 10, -1, 1)
brs$ideology_scale_orig <- rep("0-10", nrow(brs))
brs$survey <- rep("BRS 1997", nrow(brs))
# create panel
# Every respondent is assigned the 1992-1997 term and repeated once per year
# of it (inclusive). rep() repeats each row index once per term year and
# sequence() numbers the years within each term; this replaces a row-by-row
# rbind() loop, which copied the growing data frame on every iteration and
# broke on zero rows because 1:nrow(x) counts down from 1 to 0.
brs$leg.begin <- rep("1992", nrow(brs))
brs$leg.end <- rep("1997", nrow(brs))
term.start <- as.integer(brs$leg.begin)
term.years <- as.integer(brs$leg.end) - term.start + 1L
row.id <- rep(seq_len(nrow(brs)), times = term.years)
brs.full <- brs[row.id, ]
brs.full$year <- term.start[row.id] + sequence(term.years) - 1L
brs <- brs.full
brs <- brs[order(brs$country,brs$year),
           c("country","year","survey","ideology","ideology_scaled",
             "ideology_scale_orig","party","sex")]
# store for later
write.csv(brs, "./cleaned-data/brs-1997.csv",row.names = F)
# clear the workspace except for the shared helpers and df.final
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 9. BRS 2001
brs <- read.dta("./BRS2001/BRS2001.dta", convert.factors = FALSE)
brs <- setNames(brs[, c("q2a1", "q28a", "q1", "q45")],
                c("mp", "ideology", "party", "sex"))
brs$country <- rep("United Kingdom", nrow(brs))
# keep only rows whose mp flag is 1
brs$mp <- as.character(brs$mp)
brs$ideology <- as.character(brs$ideology)
brs <- brs[brs$mp %in% "1", ]
# ideology: codes 99 and 88 are set to missing
brs$ideology[brs$ideology %in% c("99", "88")] <- NA
# clean party: codes with no entry in the table are left as they are
brs$party <- as.character(brs$party)
party.labels <- c("1" = "Conservative",
                  "2" = "Labour",
                  "3" = "Liberal Democrat",
                  "5" = "Plaid Cymru")
coded <- brs$party %in% names(party.labels)
brs$party[coded] <- unname(party.labels[brs$party[coded]])
# clean sex: code 8 is set to missing
brs$sex <- as.character(brs$sex)
brs$sex[brs$sex %in% "1"] <- "Male"
brs$sex[brs$sex %in% "2"] <- "Female"
brs$sex[brs$sex %in% "8"] <- NA
# tidy: responses are rescaled from 0-10 to [-1,1]
brs$ideology <- as.numeric(brs$ideology)
brs$ideology_scaled <- rescalr(brs$ideology, 0, 10, -1, 1)
brs$ideology_scale_orig <- rep("0-10", nrow(brs))
brs$survey <- rep("BRS 2001", nrow(brs))
# create panel
# Every respondent is assigned the 1997-2001 term and repeated once per year
# of it (inclusive). rep() repeats each row index once per term year and
# sequence() numbers the years within each term; this replaces a row-by-row
# rbind() loop, which copied the growing data frame on every iteration and
# broke on zero rows because 1:nrow(x) counts down from 1 to 0.
brs$leg.begin <- rep("1997", nrow(brs))
brs$leg.end <- rep("2001", nrow(brs))
term.start <- as.integer(brs$leg.begin)
term.years <- as.integer(brs$leg.end) - term.start + 1L
row.id <- rep(seq_len(nrow(brs)), times = term.years)
brs.full <- brs[row.id, ]
brs.full$year <- term.start[row.id] + sequence(term.years) - 1L
brs <- brs.full
brs <- brs[order(brs$country,brs$year),
           c("country","year","survey","ideology","ideology_scaled",
             "ideology_scale_orig","party","sex")]
# store for later
write.csv(brs, "./cleaned-data/brs-2001.csv",row.names = F)
# clear the workspace except for the shared helpers and df.final
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 10. CIRCAP 2006
cap06 <- as.data.frame(as_factor(read_dta("./CIRCAP 2006/CIRCAP-2006.dta")))
# keep the rows labelled "MEPs" in v3
cap06 <- cap06[cap06$v3 == "MEPs",]
cap06 <- cap06[, c("v2", "v54", "v56", "v76", "v77", "v78", "v79",
                   "v80", "v81", "v82", "v83")]
colnames(cap06) <- c("country", "ideology", "sex", "party_fr", "party_de",
                     "party_uk", "party_it", "party_nd", "party_pl",
                     "party_pt", "party_es")
# no party variable is read for Slovakia; keep an all-NA placeholder
cap06$party_sv <- rep(NA, nrow(cap06))
# store every column as character
cap06[] <- lapply(cap06, as.character)
cap06$year <- rep("2006", nrow(cap06))
# fix ideology: map the response labels to the codes 1-7; labels not listed
# here are left unchanged
cap06$ideology[cap06$ideology %in% "Refuse to answer/Don't know"] <- NA
ideology.codes <- c("Extreme Left"  = "1",
                    "Left"          = "2",
                    "Center Left"   = "3",
                    "Center"        = "4",
                    "Center Right"  = "5",
                    "Right"         = "6",
                    "Extreme right" = "7")
labelled <- cap06$ideology %in% names(ideology.codes)
cap06$ideology[labelled] <- unname(ideology.codes[cap06$ideology[labelled]])
# fix partisanship
# swap.label() replaces every exact occurrence of `from` in x by `to`; missing
# values in x are left alone.
swap.label <- function(x, from, to) {
  x[which(x == from)] <- to
  x
}
# France
cap06$party_fr <- swap.label(cap06$party_fr, "Not asked", NA)
# Germany
cap06$party_de <- swap.label(cap06$party_de, "Refuse to answer/Don't know", NA)
cap06$party_de <- swap.label(cap06$party_de, "B?ndnis 90\\Die Gr?nen",
                             "Bundnis90 Die Grunen")
cap06$party_de <- swap.label(cap06$party_de, "CDU\\CSU", "CDU CSU")
cap06$party_de <- swap.label(cap06$party_de, "Linkspartei. PDS",
                             "Linkspartei PDS")
cap06$party_de <- swap.label(cap06$party_de, "SPD\\FDP", "SPD FDP")
# United Kingdom
cap06$party_uk <- swap.label(cap06$party_uk,
                             "Democratic Unionist Party (Northern Ireland)",
                             "DUP")
cap06$party_uk <- swap.label(cap06$party_uk, "Not asked", NA)
# Italy: strip full stops and opening parentheses
# NOTE(review): closing parentheses are not removed -- confirm this is intended
cap06$party_it <- gsub(".", "", cap06$party_it, fixed = TRUE)
cap06$party_it <- gsub("(", "", cap06$party_it, fixed = TRUE)
cap06$party_it <- swap.label(cap06$party_it, "Not asked", NA)
# Netherlands
cap06$party_nd <- swap.label(cap06$party_nd, "Not asked", NA)
cap06$party_nd <- swap.label(cap06$party_nd, "Socialistische Partij (SP)",
                             "Socialistische Partij")
# Poland: strip full stops and "- "
cap06$party_pl <- gsub(".", "", cap06$party_pl, fixed = TRUE)
cap06$party_pl <- swap.label(cap06$party_pl, "Not asked", NA)
cap06$party_pl <- gsub("- ", "", cap06$party_pl, fixed = TRUE)
# Portugal
cap06$party_pt <- swap.label(cap06$party_pt, "Not asked", NA)
cap06$party_pt <- swap.label(cap06$party_pt,
                             "Coliga?ao For?a Portugal (PPD/PSD.CDS-PP)",
                             "PPD PSD CDS PP")
cap06$party_pt <- swap.label(
  cap06$party_pt,
  "Partido Comunista Portugu?s/Coliga??o Democr?tica U",
  "PCP CDU"
)
# Spain
cap06$party_es <- swap.label(cap06$party_es, "Not asked", NA)
cap06$party_es <- swap.label(cap06$party_es,
                             "CDC (Convergencia Democr?tica de Catalunya)",
                             "CDC")
cap06$party_es <- swap.label(cap06$party_es,
                             "PNV (Partido Nacionalista Vasco)", "PNV")
# merge party
# Each country has its own party column; copy the one that belongs to the
# respondent's country into a single `party` variable.
party.source <- c("France"          = "party_fr",
                  "Germany"         = "party_de",
                  "Italy"           = "party_it",
                  "Poland"          = "party_pl",
                  "Portugal"        = "party_pt",
                  "Slovakia"        = "party_sv",
                  "Spain"           = "party_es",
                  "The Netherlands" = "party_nd",
                  "UK"              = "party_uk")
cap06$party <- rep(NA, nrow(cap06))
for (ctry in names(party.source)) {
  cap06$party <- ifelse(cap06$country == ctry,
                        cap06[[party.source[[ctry]]]],
                        cap06$party)
}
# -998 is set to missing
cap06$party[cap06$party %in% "-998"] <- NA
# fix countries
country.names <- c("The Netherlands" = "Netherlands",
                   "Slovakia"        = "Slovak Republic",
                   "UK"              = "United Kingdom")
renamed <- cap06$country %in% names(country.names)
cap06$country[renamed] <- unname(country.names[cap06$country[renamed]])
# tidy: -999 is set to missing; responses are rescaled from 1-7 to [-1,1]
cap06$ideology <- as.numeric(cap06$ideology)
cap06$ideology[cap06$ideology %in% -999] <- NA
cap06$ideology_scaled <- rescalr(cap06$ideology, 1, 7, -1, 1)
cap06$ideology_scale_orig <- rep("1-7", nrow(cap06))
cap06$survey <- rep("CIRCAP 2006", nrow(cap06))
cap06$year <- as.numeric(cap06$year)
keep.cols <- c("country", "year", "survey", "ideology", "ideology_scaled",
               "ideology_scale_orig", "party", "sex")
cap06 <- cap06[order(cap06$country, cap06$year), keep.cols]
# store for later
write.csv(cap06, "./cleaned-data/circap-2006.csv", row.names = F)
# clear the workspace except for the shared helpers and df.final
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 11. CIRCaP 2007
cap07 <- read.dta("./CIRCAP 2007/CIRCAP-2007.dta")
# keep the rows labelled "Parliamentarians" in v2
cap07 <- cap07[cap07$v2 == "Parliamentarians",]
cap07 <- cap07[, c("v3", "v66", "v5", "v98", "v99", "v100", "v101",
                   "v102", "v103", "v104", "v105", "v106")]
colnames(cap07) <- c("country", "ideology", "sex", "party_de", "party_fr",
                     "party_it", "party_es", "party_nd", "party_pt",
                     "party_uk", "party_pl", "party_sv")
# store every column as character
cap07[] <- lapply(cap07, as.character)
# fix ideology: map the response labels to the codes 1-7; labels not listed
# here are left unchanged
cap07$ideology[cap07$ideology %in% "Refuse to answer/Don't know"] <- NA
ideology.codes <- c("Extreme left"  = "1",
                    "Left"          = "2",
                    "Center left"   = "3",
                    "Center"        = "4",
                    "Center right"  = "5",
                    "Right"         = "6",
                    "Extreme right" = "7")
labelled <- cap07$ideology %in% names(ideology.codes)
cap07$ideology[labelled] <- unname(ideology.codes[cap07$ideology[labelled]])
# merge party
# Each country has its own party column; copy the one that belongs to the
# respondent's country into a single `party` variable.
party.source <- c("France"      = "party_fr",
                  "Germany"     = "party_de",
                  "Italy"       = "party_it",
                  "Netherlands" = "party_nd",
                  "Poland"      = "party_pl",
                  "Portugal"    = "party_pt",
                  "Slovakia"    = "party_sv",
                  "Spain"       = "party_es",
                  "UK"          = "party_uk")
cap07$party <- rep(NA, nrow(cap07))
for (ctry in names(party.source)) {
  cap07$party <- ifelse(cap07$country == ctry,
                        cap07[[party.source[[ctry]]]],
                        cap07$party)
}
# clean party: keep only letters, digits and spaces; "None" is set to missing
cap07$party <- gsub("[^[:alnum:] ]", "", cap07$party)
cap07$party[cap07$party %in% "None"] <- NA
# fix countries
cap07$country[which(cap07$country == "Slovakia")] <- "Slovak Republic"
cap07$country[which(cap07$country == "UK")] <- "United Kingdom"
# tidy: responses are rescaled from 1-7 to [-1,1]; labels that were not mapped
# to a code become NA in as.numeric() (with a coercion warning)
cap07$ideology <- as.numeric(cap07$ideology)
cap07$ideology_scaled <- rescalr(cap07$ideology, 1, 7, -1, 1)
cap07$ideology_scale_orig <- rep("1-7",nrow(cap07))
cap07$survey <- rep("CIRCAP 2007",nrow(cap07))
# year is stored as a number, like in the other cleaned files of this script
# (it used to be the character string "2007", which write.csv() quotes)
cap07$year <- rep(2007,nrow(cap07))
cap07 <- cap07[order(cap07$country,cap07$year),
               c("country","year","survey","ideology","ideology_scaled",
                 "ideology_scale_orig","party","sex")]
# store for later
write.csv(cap07, "./cleaned-data/circap-2007.csv", row.names = F)
# clear the workspace except for the shared helpers and df.final
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 12. Comparative Candidates Survey
ccs <- as.data.frame(as_factor(read_spss("./CCS/620_CCS_Data_Wave1_v4.0.sav")))
ccs <- ccs[, c("t1", "t3", "t8", "c3", "a1", "e1")]
colnames(ccs) <- c("country", "leg.begin", "elected", "ideology", "party",
                   "sex")
ccs$leg.begin <- as.character(ccs$leg.begin)
# clean ideology: the two labelled end points become 0 and 10; anything that
# is still non-numeric afterwards becomes NA in as.numeric()
ccs$ideology <- as.character(ccs$ideology)
ccs$ideology[ccs$ideology %in% "left"] <- "0"
ccs$ideology[ccs$ideology %in% "right"] <- "10"
ccs$ideology <- as.numeric(ccs$ideology)
# restrict to those we have data for, and who were elected
ccs$country <- as.character(ccs$country)
ccs <- ccs[as.character(ccs$elected) %in% "Elected", ]
ccs$elected <- NULL
# clean sex
ccs$sex <- as.character(ccs$sex)
ccs$sex[ccs$sex %in% "male"] <- "Male"
ccs$sex[ccs$sex %in% "female"] <- "Female"
# clean party
ccs$party <- as.character(ccs$party)
# Party codes are specific to a country and election year, so each
# country-year has its own lookup table (code -> party name). Codes with no
# entry are left unchanged.
# NOTE(review): "National Coalition Pary" (Finland 2007) looks like a typo for
# "Party"; it is kept as is in case later code matches on this label.
ccs.party.labels <- list(
  list(country = "Germany", year = "2005",
       labels = c("1"  = "SPD",
                  "2"  = "CDU",
                  "3"  = "CSU",
                  "4"  = "FDP",
                  "5"  = "Bundnis90 Die Grunen",
                  "6"  = "Linke PDS",
                  "7"  = "WASG",
                  "97" = NA)),
  list(country = "Netherlands", year = "2006",
       labels = c("1" = "CDA",
                  "2" = "PvdA",
                  "3" = "SP",
                  "4" = "VVD",
                  "6" = "Groen Links",
                  "9" = "SGP")),
  list(country = "Australia", year = "2007",
       labels = c("1" = "Liberal Party of Australia",
                  "2" = "Australian Labor Party",
                  "3" = "National Party of Australia",
                  "5" = "Australian Greens")),
  list(country = "Belgium", year = "2007",
       labels = c("1"  = "PS",
                  "2"  = "MR",
                  "3"  = "CDH",
                  "4"  = "ECOLO",
                  "5"  = "FN",
                  "11" = "CDandV",
                  "13" = "spa",
                  "14" = "spirit",
                  "15" = "Open VLD",
                  "16" = "VlaamsBelang")),
  list(country = "Finland", year = "2007",
       labels = c("1"  = "Centre Party",
                  "2"  = "National Coalition Pary",
                  "3"  = "Social Democrats",
                  "4"  = "Left Alliance",
                  "5"  = "Greens",
                  "6"  = "Christian Democrats",
                  "7"  = "Swedish Peoples Party",
                  "8"  = "True Finns",
                  "10" = "Liberals"))
)
for (spec in ccs.party.labels) {
  # rows of this country-year whose party is one of the known codes
  hit <- which(ccs$country == spec$country & ccs$leg.begin == spec$year &
                 ccs$party %in% names(spec$labels))
  ccs$party[hit] <- unname(spec$labels[ccs$party[hit]])
}
rm(ccs.party.labels, spec, hit)
# Switzerland 2007
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2007" &
                  ccs$party == "1")] <- "CVP PDC"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2007" &
                  ccs$party == "2")] <- "FDP PRD"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2007" &
                  ccs$party == "3")] <- "SVP UDC"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2007" &
                  ccs$party == "4")] <- "SPS PSS"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2007" &
                  ccs$party == "5")] <- "GPS PES"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2007" &
                  ccs$party == "7")] <- "LPS PLS"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2007" &
                  ccs$party == "9")] <- "EVP PEV"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2007" &
                  ccs$party == "10")] <- "EDU UDF"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2007" &
                  ccs$party == "12")] <- "Lega"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2007" &
                  ccs$party == "13")] <- "Pda PST"
# Austria 2008
ccs$party[which(ccs$country == "Austria" & ccs$leg.begin == "2008" &
                  ccs$party == "1")] <- "SPOE"
ccs$party[which(ccs$country == "Austria" & ccs$leg.begin == "2008" &
                  ccs$party == "2")] <- "OEVP"
ccs$party[which(ccs$country == "Austria" & ccs$leg.begin == "2008" &
                  ccs$party == "3")] <- "Gruene"
ccs$party[which(ccs$country == "Austria" & ccs$leg.begin == "2008" &
                  ccs$party == "4")] <- "FPOE"
ccs$party[which(ccs$country == "Austria" & ccs$leg.begin == "2008" &
                  ccs$party == "5")] <- "BZOE"
# Germany 2009
ccs$party[which(ccs$country == "Germany" & ccs$leg.begin == "2009" &
                  ccs$party == "1")] <- "SPD"
ccs$party[which(ccs$country == "Germany" & ccs$leg.begin == "2009" &
                  ccs$party == "2")] <- "CDU"
ccs$party[which(ccs$country == "Germany" & ccs$leg.begin == "2009" &
                  ccs$party == "3")] <- "CSU"
ccs$party[which(ccs$country == "Germany" & ccs$leg.begin == "2009" &
                  ccs$party == "4")] <- "FDP"
ccs$party[which(ccs$country == "Germany" & ccs$leg.begin == "2009" &
                  ccs$party == "5")] <- "Bundnis90 Die Grunen"
ccs$party[which(ccs$country == "Germany" & ccs$leg.begin == "2009" &
                  ccs$party == "6")] <- "Die Linke"
# Iceland 2009
ccs$party[which(ccs$country == "Iceland" & ccs$leg.begin == "2009" &
                  ccs$party == "1")] <- "Social Democratic Alliance"
ccs$party[which(ccs$country == "Iceland" & ccs$leg.begin == "2009" &
                  ccs$party == "2")] <- "Progressive Party"
ccs$party[which(ccs$country == "Iceland" & ccs$leg.begin == "2009" &
                  ccs$party == "3")] <- "Independent Pary Conservative"
ccs$party[which(ccs$country == "Iceland" & ccs$leg.begin == "2009" &
                  ccs$party == "4")] <- "Left Green Movement"
ccs$party[which(ccs$country == "Iceland" & ccs$leg.begin == "2009" &
                  ccs$party == "6")] <- "Civic Movement"
# Belgium 2010
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "1")] <- "PS"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "2")] <- "MR"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "3")] <- "CDH"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "4")] <- "ECOLO"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "11")] <- "CDandV"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "12")] <- "NVA"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "13")] <- "SPA"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "15")] <- "VLD"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "16")] <- "VlaamsBelang VB"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "17")] <- "GROEN"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "18")] <- "Lijst Dedecker"
ccs$party[which(ccs$country == "Belgium" & ccs$leg.begin == "2010" &
                  ccs$party == "19")] <- "PP"
# Hungary 2010
ccs$party[which(ccs$country == "Hungary" & ccs$leg.begin == "2010" &
                  ccs$party == "1")] <- "Fidesz"
ccs$party[which(ccs$country == "Hungary" & ccs$leg.begin == "2010" &
                  ccs$party == "2")] <- "KDNP"
ccs$party[which(ccs$country == "Hungary" & ccs$leg.begin == "2010" &
                  ccs$party == "3")] <- "MSZP"
ccs$party[which(ccs$country == "Hungary" & ccs$leg.begin == "2010" &
                  ccs$party == "4")] <- "Jobbik"
ccs$party[which(ccs$country == "Hungary" & ccs$leg.begin == "2010" &
                  ccs$party == "5")] <- "LMP"
ccs$party[which(ccs$country == "Hungary" & ccs$leg.begin == "2010" &
                  ccs$party == "20")] <- "Other"
ccs$party[which(ccs$country == "Hungary" & ccs$leg.begin == "2010" &
                  ccs$party == "97")] <- "No party"
# UK 2010
ccs$party[which(ccs$country == "UK" & ccs$leg.begin == "2010" &
                  ccs$party == "1")] <- "Labour"
ccs$party[which(ccs$country == "UK" & ccs$leg.begin == "2010" &
                  ccs$party == "2")] <- "Conservative"
ccs$party[which(ccs$country == "UK" & ccs$leg.begin == "2010" &
                  ccs$party == "3")] <- "Liberal Democrat"
ccs$party[which(ccs$country == "UK" & ccs$leg.begin == "2010" &
                  ccs$party == "6")] <- "Scottish National Party"
ccs$party[which(ccs$country == "UK" & ccs$leg.begin == "2010" &
                  ccs$party == "7")] <- "Plaid Cymru"
# Estonia 2011
ccs$party[which(ccs$country == "Estonia" & ccs$leg.begin == "2011" &
                  ccs$party == "2")] <- "Estonian Center Party"
ccs$party[which(ccs$country == "Estonia" & ccs$leg.begin == "2011" &
                  ccs$party == "3")] <- "Estonian Reform Party"
ccs$party[which(ccs$country == "Estonia" & ccs$leg.begin == "2011" &
                  ccs$party == "7")] <- "Union of Pro Patria and Res Publica"
ccs$party[which(ccs$country == "Estonia" & ccs$leg.begin == "2011" &
                  ccs$party == "8")] <- "Social Democratic Party"
# Finland 2011
ccs$party[which(ccs$country == "Finland" & ccs$leg.begin == "2011" &
                  ccs$party == "1")] <- "National Coalition Party"
ccs$party[which(ccs$country == "Finland" & ccs$leg.begin == "2011" &
                  ccs$party == "2")] <- "Social Democratic Party"
ccs$party[which(ccs$country == "Finland" & ccs$leg.begin == "2011" &
                  ccs$party == "3")] <- "The Finns Party"
ccs$party[which(ccs$country == "Finland" & ccs$leg.begin == "2011" &
                  ccs$party == "4")] <- "Center Party of Finland"
ccs$party[which(ccs$country == "Finland" & ccs$leg.begin == "2011" &
                  ccs$party == "5")] <- "The Left Alliance"
ccs$party[which(ccs$country == "Finland" & ccs$leg.begin == "2011" &
                  ccs$party == "6")] <- "Green League"
ccs$party[which(ccs$country == "Finland" & ccs$leg.begin == "2011" &
                  ccs$party == "7")] <- "Christian Democrats in Finland"
ccs$party[which(ccs$country == "Finland" & ccs$leg.begin == "2011" &
                  ccs$party == "8")] <- "Swedish Peoples Party in Finland"
ccs$party[which(ccs$country == "Finland" & ccs$leg.begin == "2011" &
                  ccs$party == "10")] <- "Communist Party"
ccs$party[which(ccs$country == "Finland" & ccs$leg.begin == "2011" &
                  ccs$party == "20")] <- "Other"
# New Zealand 2011
ccs$party[which(ccs$country == "New Zealand" & ccs$leg.begin == "2011" &
                  ccs$party == "1")] <- "Labour"
ccs$party[which(ccs$country == "New Zealand" & ccs$leg.begin == "2011" &
                  ccs$party == "2")] <- "National"
ccs$party[which(ccs$country == "New Zealand" & ccs$leg.begin == "2011" &
                  ccs$party == "3")] <- "Green"
ccs$party[which(ccs$country == "New Zealand" & ccs$leg.begin == "2011" &
                  ccs$party == "4")] <- "NZ First"
ccs$party[which(ccs$country == "New Zealand" & ccs$leg.begin == "2011" &
                  ccs$party == "5")] <- "Maori Party"
ccs$party[which(ccs$country == "New Zealand" & ccs$leg.begin == "2011" &
                  ccs$party == "6")] <- "Act"
ccs$party[which(ccs$country == "New Zealand" & ccs$leg.begin == "2011" &
                  ccs$party == "7")] <- "United Future"
# Portugal 2011
ccs$party[which(ccs$country == "Portugal" & ccs$leg.begin == "2011" &
                  ccs$party == "1")] <- "BE"
ccs$party[which(ccs$country == "Portugal" & ccs$leg.begin == "2011" &
                  ccs$party == "2")] <- "CDS PP"
ccs$party[which(ccs$country == "Portugal" & ccs$leg.begin == "2011" &
                  ccs$party == "3")] <- "PEV"
ccs$party[which(ccs$country == "Portugal" & ccs$leg.begin == "2011" &
                  ccs$party == "4")] <- "PCP"
ccs$party[which(ccs$country == "Portugal" & ccs$leg.begin == "2011" &
                  ccs$party == "5")] <- "PSD"
ccs$party[which(ccs$country == "Portugal" & ccs$leg.begin == "2011" &
                  ccs$party == "6")] <- "PS"
# Switzerland 2011
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2011" &
                  ccs$party == "1")] <- "FDP PRD"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2011" &
                  ccs$party == "2")] <- "CVP PDC"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2011" &
                  ccs$party == "3")] <- "SP PS"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2011" &
                  ccs$party == "4")] <- "SVP UDC"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2011" &
                  ccs$party == "7")] <- "EVP PEP"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2011" &
                  ccs$party == "10")] <- "GLP Vertliberaux"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2011" &
                  ccs$party == "11")] <- "MCG"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2011" &
                  ccs$party == "13")] <- "GPS PES"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2011" &
                  ccs$party == "18")] <- "Lega"
ccs$party[which(ccs$country == "Switzerland" & ccs$leg.begin == "2011" &
                  ccs$party == "19")] <- "BDP"
# Greece 2012
ccs$party[which(ccs$country == "Greece" & ccs$leg.begin == "2012" &
                  ccs$party == "1")] <- "ND"
ccs$party[which(ccs$country == "Greece" & ccs$leg.begin == "2012" &
                  ccs$party == "2")] <- "PASOK"
ccs$party[which(ccs$country == "Greece" & ccs$leg.begin == "2012" &
                  ccs$party == "3")] <- "Syriza"
ccs$party[which(ccs$country == "Greece" & ccs$leg.begin == "2012" &
                  ccs$party == "4")] <- "ANEL"
ccs$party[which(ccs$country == "Greece" & ccs$leg.begin == "2012" &
                  ccs$party == "5")] <- "CA Golden Dawn"
ccs$party[which(ccs$country == "Greece" & ccs$leg.begin == "2012" &
                  ccs$party == "6")] <- "DIMAR"
# Romania 2012
ccs$party[which(ccs$country == "Romania" & ccs$leg.begin == "2012" &
                  ccs$party == "1")] <- "PSD"
ccs$party[which(ccs$country == "Romania" & ccs$leg.begin == "2012" &
                  ccs$party == "2")] <- "PNL"
ccs$party[which(ccs$country == "Romania" & ccs$leg.begin == "2012" &
                  ccs$party == "3")] <- "PC"
ccs$party[which(ccs$country == "Romania" & ccs$leg.begin == "2012" &
                  ccs$party == "4")] <- "UNPR"
ccs$party[which(ccs$country == "Romania" & ccs$leg.begin == "2012" &
                  ccs$party == "5")] <- "USL"
ccs$party[which(ccs$country == "Romania" & ccs$leg.begin == "2012" &
                  ccs$party == "6")] <- "PDL"
ccs$party[which(ccs$country == "Romania" & ccs$leg.begin == "2012" &
                  ccs$party == "8")] <- "FC"
ccs$party[which(ccs$country == "Romania" & ccs$leg.begin == "2012" &
                  ccs$party == "9")] <- "ARD"
ccs$party[which(ccs$country == "Romania" & ccs$leg.begin == "2012" &
                  ccs$party == "10")] <- "UDMR"
ccs$party[which(ccs$country == "Romania" & ccs$leg.begin == "2012" &
                  ccs$party == "11")] <- "PPDD"
# Italy 2013
ccs$party[which(ccs$country == "Italy" & ccs$leg.begin == "2013" &
                  ccs$party == "2")] <- "Fratelli dItalia"
ccs$party[which(ccs$country == "Italy" & ccs$leg.begin == "2013" &
                  ccs$party == "3")] <- "Il Popolo Della Liberta"
ccs$party[which(ccs$country == "Italy" & ccs$leg.begin == "2013" &
                  ccs$party == "4")] <- "Lega Nord"
ccs$party[which(ccs$country == "Italy" & ccs$leg.begin == "2013" &
                  ccs$party == "5")] <- "Movimento 5 Stelle Beppegrillo"
ccs$party[which(ccs$country == "Italy" & ccs$leg.begin == "2013" &
                  ccs$party == "6")] <- "Partito Democratico"
ccs$party[which(ccs$country == "Italy" & ccs$leg.begin == "2013" &
                  ccs$party == "8")] <- "Scelta Civica Con Monti Per lItalia"
ccs$party[which(ccs$country == "Italy" & ccs$leg.begin == "2013" &
                  ccs$party == "9")] <- "Sinistra Ecologia Liberta"
ccs$party[which(ccs$country == "Italy" & ccs$leg.begin == "2013" &
                  ccs$party == "11")] <- "Unione di Centro"
# finally, remove "no party". Both capitalizations are matched because the
# Hungary 2010 recode in this section stores the category as "No party",
# which a comparison against the lowercase spelling alone never clears.
ccs$party[which(ccs$party %in% c("no party", "No party"))] <- NA
# we need to manually code some end term years
# one row per legislature: country, first year of term, last year of term
ccs_terms <- rbind(
  c("Australia",   "2007", "2010"),  # 42nd Parliament
  c("Austria",     "2008", "2013"),  # 24th election
  c("Belgium",     "2007", "2010"),  # Federal election
  c("Belgium",     "2010", "2014"),  # Federal election
  c("Estonia",     "2011", "2015"),  # General election
  c("Finland",     "2007", "2011"),  # 35th Parliament
  c("Finland",     "2011", "2015"),  # 36th Parliament
  c("Germany",     "2005", "2009"),  # 16th Bundestag
  c("Germany",     "2009", "2013"),  # 17th Bundestag
  c("Greece",      "2012", "2015"),  # 16th Parliament
  c("Hungary",     "2010", "2014"),  # 6th (?) Parliament
  c("Iceland",     "2009", "2013"),  # Gen Elec - Parliament
  c("Italy",       "2013", "2018"),  # 17th Parliament
  c("Netherlands", "2006", "2010"),  # Gen elec - parliament
  c("New Zealand", "2011", "2014"),  # Gen elec - parliament
  c("Portugal",    "2011", "2015"),  # general election
  c("Romania",     "2012", "2016"),  # general election
  c("Switzerland", "2007", "2011"),  # National Council and Council of States
  c("Switzerland", "2011", "2015"),  # National Council and Council of States
  c("UK",          "2010", "2015")   # General election
)
# respondents whose country/term is not listed keep a missing end year
ccs$leg.end <- rep(NA, nrow(ccs))
for (r in seq_len(nrow(ccs_terms))) {
  in_term <- which(ccs$country == ccs_terms[r, 1] &
                     ccs$leg.begin == ccs_terms[r, 2])
  ccs$leg.end[in_term] <- ccs_terms[r, 3]
}
# expand into time-series: repeat every respondent once per year of the
# legislative term (leg.begin through leg.end, inclusive)
term_first <- as.numeric(as.character(ccs$leg.begin))
term_last <- as.numeric(as.character(ccs$leg.end))
# a respondent without a coded term cannot be expanded; fail with a message
# that says so instead of the bare "NA/NaN argument" raised by `:`
if (anyNA(term_first) || anyNA(term_last)) {
  stop("CCS rows with a missing leg.begin/leg.end cannot be expanded; ",
       "add the term to the manual end-year coding", call. = FALSE)
}
term_years <- Map(`:`, term_first, term_last)
# replicate rows in one step rather than growing a data frame with rbind()
# inside a loop, which copies the accumulated result on every iteration
ccs.full <- ccs[rep(seq_len(nrow(ccs)), times = lengths(term_years)), ]
ccs.full$year <- as.integer(unlist(term_years))
ccs.full <- ccs.full[, c("country","year","ideology","party","sex",
                         "leg.begin","leg.end")]
ccs <- ccs.full; ccs.full <- NULL
### now resolve some duplicates from overlapping years
# When one term ends in the year the next begins, that year appears for both
# terms; keep it only for the incoming term.
# columns: country, shared year, leg.begin of the outgoing term to drop
ccs_overlaps <- rbind(
  c("Belgium",     "2010", "2007"),  # Belgium 2007-2010, remove 2010 data
  c("Finland",     "2011", "2007"),  # Finland 2007-2011, remove 2011 data
  c("Germany",     "2009", "2005"),  # Germany 2005-2009, remove 2009 data
  c("Switzerland", "2011", "2007")   # Switzerland 2007-2011, remove 2011 data
)
for (r in seq_len(nrow(ccs_overlaps))) {
  outgoing <- ccs$country == ccs_overlaps[r, 1] &
    ccs$year == ccs_overlaps[r, 2] &
    ccs$leg.begin == ccs_overlaps[r, 3]
  # Filter with a logical mask rather than ccs[-which(...), ]: when nothing
  # matches, -which() is an empty index and would select zero rows, silently
  # emptying the data. %in% TRUE treats NA comparisons as "keep".
  ccs <- ccs[!(outgoing %in% TRUE), ]
}
# rescale the 0-10 self-placement onto [-1, 1] and record its provenance
ccs$ideology_scaled <- rescalr(ccs$ideology, theoryMin = 0, theoryMax = 10,
                               newMin = -1, newMax = 1)
ccs$ideology_scale_orig <- rep("0-10", times = nrow(ccs))
ccs$survey <- rep("Comparative Candidates Survey", times = nrow(ccs))
# clean country
uk_rows <- which(ccs$country == "UK")
ccs$country[uk_rows] <- "United Kingdom"
# clean up: sort by country and year, keep the shared output columns
ccs_out_cols <- c("country","year","survey","ideology","ideology_scaled",
                  "ideology_scale_orig","party","sex")
ccs <- ccs[order(ccs$country, ccs$year), ccs_out_cols]
# store for later
write.csv(ccs, file = "./cleaned-data/ccs.csv", row.names = FALSE)
# drop everything except the shared helpers and the final dataset
keep_objs <- c("completeFun","expand.grid.df",
               "qcut","rescalr","simpleCap","df.final")
rm(list = setdiff(ls(), keep_objs))

### 13. EPRG MEP surveys
eprg <- read.dta("./EPRG MEP survey/EPRG MEP 2000-2015 merge 20 Oct 2016 stata12.dta")
eprg <- eprg[, c("q1_1","b_3","q3_1_1","q1_7","q1_3")]
names(eprg) <- c("country","wave","ideology","sex","party")
for (v in c("country","wave","ideology")) {
  eprg[[v]] <- as.character(eprg[[v]])
}
# survey year of each wave; unrecognized waves stay missing
wave_year <- c("1" = "2000", "2" = "2006", "3" = "2011", "4" = "2015")
eprg$year <- unname(wave_year[eprg$wave])
# fix countries: numeric codes 1-28 index into this list of names
cs <- c("Belgium", "Bulgaria", "Czech Republic", "Denmark", "Germany",
        "Estonia", "Ireland", "Greece", "Spain", "France", "Italy", "Cyprus",
        "Latvia", "Lithuania", "Luxembourg", "Hungary", "Malta", "Netherlands",
        "Austria", "Poland", "Portugal", "Romania", "Slovenia",
        "Slovak Republic", "Finland", "Sweden", "United Kingdom","Croatia")
code_pos <- match(eprg$country, as.character(seq_along(cs)))
has_code <- !is.na(code_pos)
eprg$country[has_code] <- cs[code_pos[has_code]]
# create legislative begin/end years
# note they're all elected together at the same time, so countries don't matter
term_end <- c("2000" = "2004", "2006" = "2009", "2011" = "2014",
              "2015" = "2019")
term_begin <- c("2000" = "1999", "2006" = "2004", "2011" = "2009",
                "2015" = "2014")
eprg$leg.end <- unname(term_end[eprg$year])
eprg$leg.begin <- unname(term_begin[eprg$year])
# store wave info, keyed by the first year of the term
wave_label <- c("1999" = "EPRG MEP Survey Wave 1",
                "2004" = "EPRG MEP Survey Wave 2",
                "2009" = "EPRG MEP Survey Wave 3",
                "2014" = "EPRG MEP Survey Wave 4")
eprg$survey <- unname(wave_label[eprg$leg.begin])
# fix parties: keep letters, digits and spaces only, trim, blank -> missing
party_clean <- gsub("[^[:alnum:] ]", "", eprg$party)
party_clean <- str_trim(party_clean, "both")
party_clean[which(party_clean == "")] <- NA
eprg$party <- party_clean
# fix sex
eprg$sex[which(eprg$sex == "1")] <- "Male"
eprg$sex[which(eprg$sex == "2")] <- "Female"
# expand into time-series: repeat every respondent once per year of the
# parliamentary term (leg.begin through leg.end, inclusive)
term_first <- as.numeric(as.character(eprg$leg.begin))
term_last <- as.numeric(as.character(eprg$leg.end))
# rows whose wave was not recognized have no term; report that explicitly
# instead of failing inside `:` with "NA/NaN argument"
if (anyNA(term_first) || anyNA(term_last)) {
  stop("EPRG rows with a missing leg.begin/leg.end cannot be expanded; ",
       "check the wave coding", call. = FALSE)
}
term_years <- Map(`:`, term_first, term_last)
# replicate rows in one step rather than growing a data frame with rbind()
# inside a loop, which copies the accumulated result on every iteration
eprg.full <- eprg[rep(seq_len(nrow(eprg)), times = lengths(term_years)), ]
eprg.full$year <- as.integer(unlist(term_years))
eprg.full <- eprg.full[, c("country","year","ideology","survey","party","sex")]
eprg <- eprg.full; eprg.full <- NULL
# clean up
eprg$ideology <- as.numeric(eprg$ideology)
# note they switched from 10- to 11-point scale in wave 3... obnoxious
# Waves are selected by survey label, not by position in a split() list, so
# a wave with no respondents cannot shift later waves onto the wrong scale.
ten_point <- c("EPRG MEP Survey Wave 1", "EPRG MEP Survey Wave 2")
eleven_point <- c("EPRG MEP Survey Wave 3", "EPRG MEP Survey Wave 4")
# group rows by wave (stable sort; rows without a survey label are dropped)
# so that ties in the final country/year ordering stay in wave order
eprg <- eprg[order(eprg$survey, na.last = NA), ]
is_ten <- eprg$survey %in% ten_point
is_eleven <- eprg$survey %in% eleven_point
eprg$ideology_scale_orig <- rep(NA_character_, nrow(eprg))
eprg$ideology_scaled <- rep(NA_real_, nrow(eprg))
# waves 1-2: 1-10 scale
eprg$ideology_scaled[is_ten] <- rescalr(eprg$ideology[is_ten], 1, 10, -1, 1)
eprg$ideology_scale_orig[is_ten] <- "1-10"
# waves 3-4: shift down by one, then treat as a 0-10 scale
eprg$ideology[is_eleven] <- eprg$ideology[is_eleven] - 1
eprg$ideology_scaled[is_eleven] <- rescalr(eprg$ideology[is_eleven],
                                           0, 10, -1, 1)
eprg$ideology_scale_orig[is_eleven] <- "0-10"
# tidy up
eprg <- eprg[order(eprg$country,eprg$year),
             c("country","year","survey","ideology","ideology_scaled",
               "ideology_scale_orig","party","sex")]
# store for later
write.csv(eprg, "./cleaned-data/eprg.csv", row.names = FALSE)
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 14. Flash Eurobarometer 1996
feb <- read.dta("./Flash Eurobarometer/ZA2896.dta")
feb <- feb[,c("country","quest","q12","group","sex")]
colnames(feb) <- c("country","year","ideology","elected","sex")
# restrict to those we have data for, and who were elected
feb[] <- lapply(feb, as.character)
# Switch to the C locale for the string handling below, remembering the
# caller's locale so it can be restored afterwards (a hard-coded en_GB/en_US
# locale may not be installed and is not necessarily the one in use before).
# NOTE(review): presumably the C locale is needed because the labels contain
# bytes that are invalid in the session encoding -- confirm.
old_locale <- Sys.getlocale()
Sys.setlocale(locale="C")
feb <- feb[grepl("political", feb$elected), ]
feb$elected <- NULL
# fix country
# vapply always yields a plain character vector, including for zero rows
feb$country <- vapply(feb$country, simpleCap, character(1), USE.NAMES = FALSE)
feb$country[feb$country == "Belgique"] <- "Belgium"
feb$country[feb$country == "Danmark"] <- "Denmark"
feb$country[feb$country == "Deutschland"] <- "Germany"
feb$country[feb$country == "Italia"] <- "Italy"
feb$country[feb$country == "UNITED KINGDOM"] <- "United Kingdom"
feb$country[feb$country == "Ellas"] <- "Greece"
feb$country[feb$country == "Espana"] <- "Spain"
feb$country[feb$country == "Nederland"] <- "Netherlands"
# fix ideology
# collapse repeated whitespace and strip leading/trailing whitespace
feb$ideology <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", feb$ideology, perl=TRUE)
feb$ideology[which(feb$ideology == "(s.r.) (n.a.)" |
                     feb$ideology == "(n.s.p.) (d.k.)")] <- NA
feb$ideology[feb$ideology == "1= gauche left"] <- "1"
feb$ideology[feb$ideology == "10= droite right"] <- "10"
feb$ideology <- as.numeric(feb$ideology)
# clean sex
feb$sex[grepl("femmes", feb$sex)] <- "Female"
feb$sex[grepl("hommes", feb$sex)] <- "Male"
# no party variable
feb$party <- rep(NA,nrow(feb))
# insert year -- everything was 1996
feb$year <- rep("1996",nrow(feb))
# scale
feb$ideology_scaled <- rescalr(feb$ideology, 1, 10, -1, 1)
feb$ideology_scale_orig <- rep("1-10",nrow(feb))
feb$survey <- rep("Flash Eurobarometer 1996",nrow(feb))
# clean up
feb <- feb[order(feb$country,feb$year),
           c("country","year","survey","ideology","ideology_scaled",
             "ideology_scale_orig","party","sex")]
# store for later
write.csv(feb, "./cleaned-data/flash-eurobarometer.csv", row.names = FALSE)
# put system back in the locale that was active before this section; this
# must happen before the workspace cleanup removes old_locale
Sys.setlocale("LC_ALL", old_locale)
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 15. FNEPS - elite
fneps.elite <- read.dta("./FNEPS/elite/FNEPS-elite.dta")
fneps.elite <- fneps.elite[, c("v4","v8","v1234","v223","v765","v579","v589")]
names(fneps.elite) <- c("sampling","won67","won68","ideology67","ideology68",
                        "party","sex")
fneps.elite$id <- row.names(fneps.elite)
# 1967: had to win (codes 1 or 3) and give a valid response (below 96)
won_67 <- fneps.elite$won67 %in% c(1, 3)
mp.in.67 <- fneps.elite[which(won_67 & fneps.elite$ideology67 < 96), ]
# 1968: had to win (codes 1, 2 or 8) and give a valid response (below 96)
won_68 <- fneps.elite$won68 %in% c(1, 2, 8)
mp.in.68 <- fneps.elite[which(won_68 & fneps.elite$ideology68 < 96), ]
# so we don't have enough data for 1968
fneps <- mp.in.67
fneps$country <- rep("France", times = nrow(fneps))
fneps$year <- rep("1967", times = nrow(fneps))
fneps <- fneps[, c("country","year","ideology67","party","sex")]
names(fneps)[3] <- "ideology"
# clean sex
fneps$sex <- as.character(fneps$sex)
fneps$sex[which(fneps$sex == "1")] <- "Male"
fneps$sex[which(fneps$sex == "2")] <- "Female"
# clean party: translate survey codes to names; codes 0 and 9 become missing,
# anything not listed is left as it is
party_codes <- c("0", "1", "4", "6", "9", "10", "20", "30", "40", "50",
                 "60", "70", "80", "90")
party_names <- c(NA, "UNR", "Other", "PDM", NA, "CD", "Fifth Republic",
                 "FGDS", "Socialist Party", "PSU", "Communist", "Radical",
                 "Independent Republics", "MRP")
fneps$party <- as.character(fneps$party)
code_pos <- match(fneps$party, party_codes)
known <- !is.na(code_pos)
fneps$party[known] <- party_names[code_pos[known]]
# scale
fneps$ideology_scaled <- rescalr(fneps$ideology, theoryMin = 1, theoryMax = 95,
                                 newMin = -1, newMax = 1)
fneps$ideology_scale_orig <- rep("1-95", times = nrow(fneps))
fneps$survey <- rep("FNEPS", times = nrow(fneps))
# clean up
fneps_out_cols <- c("country","year","survey","ideology","ideology_scaled",
                    "ideology_scale_orig","party","sex")
fneps <- fneps[order(fneps$country, fneps$year), fneps_out_cols]
# store for later
write.csv(fneps, file = "./cleaned-data/fneps-elite.csv", row.names = FALSE)
keep_objs <- c("completeFun","expand.grid.df",
               "qcut","rescalr","simpleCap","df.final")
rm(list = setdiff(ls(), keep_objs))

### 16. Hungarian Election Study (Enyedi)
enyedi <- read.dta("./Enyedi Hungary 2010/H46_en.dta")
enyedi <- enyedi[, c("sorszam","k34","k1")]
# the first column is only a placeholder; it is overwritten with the country
names(enyedi) <- c("country","ideology","party")
n_resp <- nrow(enyedi)
enyedi$country <- rep("Hungary", times = n_resp)
enyedi$year <- rep("2010", times = n_resp)
enyedi$party <- as.character(enyedi$party)
enyedi$survey <- rep("Hungarian Election Study", times = n_resp)
# since all elected in 2010, all legislative term is 2010-2014:
# stack one full copy of the respondents per year of the term
enyr <- 2010:2014
enyedi <- enyedi[rep(seq_len(n_resp), times = length(enyr)), ]
enyedi$year <- rep(enyr, each = n_resp)
# clean ideology (99 is treated as missing)
enyedi$ideology[enyedi$ideology == 99] <- NA
enyedi$ideology_scaled <- rescalr(enyedi$ideology, theoryMin = 0,
                                  theoryMax = 10, newMin = -1, newMax = 1)
enyedi$ideology_scale_orig <- rep("0-10", times = nrow(enyedi))
# no sex variable
enyedi$sex <- rep(NA, times = nrow(enyedi))
# clean party
dk_rows <- which(enyedi$party == "DK/DA")
enyedi$party[dk_rows] <- NA
indep_rows <- which(enyedi$party == "Not a member")
enyedi$party[indep_rows] <- "Independent"
# clean up
enyedi_out_cols <- c("country","year","survey","ideology","ideology_scaled",
                     "ideology_scale_orig","party","sex")
enyedi <- enyedi[order(enyedi$country, enyedi$year), enyedi_out_cols]
# store for later
write.csv(enyedi, file = "./cleaned-data/enyedi.csv", row.names = FALSE)
keep_objs <- c("completeFun","expand.grid.df",
               "qcut","rescalr","simpleCap","df.final")
rm(list = setdiff(ls(), keep_objs))

### 17. Malaise book (Joignant et al. 2016) - elite
load("./Malaise/ARGelites.rdata")
ARGelites$country <- rep("Argentina",nrow(ARGelites))
ARGelites$year <- rep("2014",nrow(ARGelites))
ARGelites <- ARGelites[,c("country","year","P67","cargo","sexo")]
colnames(ARGelites)[c(3:5)] <- c("ideology","job","sex")
# empty party
ARGelites$party <- rep(NA,nrow(ARGelites)) # no codebook
# restrict elites to elected nationally
# Match job titles under the C locale, remembering the caller's locale so it
# can be restored afterwards (a hard-coded en_GB/en_US locale may not be
# installed and is not necessarily the one in use before).
old_locale <- Sys.getlocale()
Sys.setlocale(locale="C")
# flag deputies, legislators and senators with "0", then keep only those
ARGelites$job[grepl("Diputad", ARGelites$job)] <- "0"
ARGelites$job[grepl("Legislador", ARGelites$job)] <- "0"
ARGelites$job[grepl("Senador", ARGelites$job)] <- "0"
ARGelites <- ARGelites[which(ARGelites$job == "0"),]
# put system back in the locale that was active before this section
Sys.setlocale("LC_ALL", old_locale)
# we don't have term info but everyone serving at least 2013-2015
arglist <- list(ARGelites,ARGelites,ARGelites); argyear <- c(2013:2015)
for(i in seq_along(argyear)){
  arglist[[i]]$year <- rep(argyear[i], nrow(arglist[[i]]))
}
ARGelites <- do.call(rbind, arglist)
# clean sex
ARGelites$sex <- as.character(ARGelites$sex)
ARGelites$sex[which(ARGelites$sex == "1")] <- "Male"
ARGelites$sex[which(ARGelites$sex == "2")] <- "Female"
# scaling ideology (99 is treated as missing)
ARGelites$ideology[ARGelites$ideology == 99] <- NA
ARGelites$ideology_scaled <- rescalr(ARGelites$ideology, 0, 10, -1, 1)
ARGelites$ideology_scale_orig <- rep("0-10",nrow(ARGelites))
ARGelites$survey <- rep("Joignant et al 2016",nrow(ARGelites))
# cleaning up
ARGelites <- ARGelites[order(ARGelites$country,ARGelites$year),
                       c("country","year","survey","ideology","ideology_scaled",
                         "ideology_scale_orig","party","sex")]
# write it out
write.csv(ARGelites, "./cleaned-data/malaise-elites.csv", row.names = FALSE)
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 18.-19. PARENEL
# Reads the two PARENEL Stata files, harmonises them, expands every respondent
# into one row per year of the coded legislature and writes the panel out.
files <- list.files(path = "./PARENEL", pattern = "\\.dta$", recursive = TRUE)
files <- paste("./PARENEL/", files, sep = "")
data.list <- lapply(files, function(x) as.data.frame(as_factor(read_dta(x))))
# NOTE(review): the positional indexing below assumes the 2008 file is listed
# before the 2012 file -- confirm against the directory contents
data.list[[1]] <- data.list[[1]][, c("Number", "v_G21", "v_A4_1a", "v_R54")]
colnames(data.list[[1]]) <- c("country", "ideology", "party", "sex")
data.list[[1]]$year <- rep("2008", nrow(data.list[[1]]))
data.list[[2]] <- data.list[[2]][, c("Number", "v_G25", "v_CCS_A4_1", "v_R63")]
colnames(data.list[[2]]) <- c("country", "ideology", "party", "sex")
data.list[[2]]$year <- rep("2012", nrow(data.list[[2]]))
# make every column character so factor levels don't clash when stacking
data.list <- lapply(data.list, function(d) {
  d[] <- lapply(d, as.character)
  d
})
parenel <- do.call(rbind, data.list)
parenel$country <- rep("Portugal", nrow(parenel))
# clean ideology
parenel$ideology[which(parenel$ideology %in% c("Don't know",
                                               "No response"))] <- NA
parenel$ideology[which(parenel$ideology == "Left")] <- "0"
parenel$ideology[which(parenel$ideology == "Right")] <- "10"
parenel$ideology <- as.numeric(parenel$ideology)
# add survey
parenel$survey <- rep(NA, nrow(parenel))
parenel$survey[parenel$year == "2008"] <- "PARENEL 2008"
parenel$survey[parenel$year == "2012"] <- "PARENEL 2012"
# code parliaments
parenel$leg.begin <- parenel$leg.end <- rep(NA, nrow(parenel))
parenel$leg.begin[parenel$year == "2008"] <- "2005"
parenel$leg.end[parenel$year == "2008"] <- "2009"
parenel$leg.begin[parenel$year == "2012"] <- "2009"
parenel$leg.end[parenel$year == "2012"] <- "2015"
parenel$year <- NULL
# expand parenel into time-series
# Build each respondent's run of years once, then replicate the rows in a
# single subsetting step instead of growing a data frame with rbind() inside
# a loop (which re-copies the accumulated rows on every iteration).
term.years <- lapply(seq_len(nrow(parenel)), function(i) {
  parenel$leg.begin[i]:parenel$leg.end[i]
})
parenel <- parenel[rep(seq_len(nrow(parenel)), times = lengths(term.years)),
                   c("country", "ideology", "survey", "party", "sex")]
parenel$year <- as.integer(unlist(term.years, use.names = FALSE))
# cleaning up
parenel$ideology_scaled <- rescalr(parenel$ideology, 0, 10, -1, 1)
parenel$ideology_scale_orig <- rep("0-10", nrow(parenel))
parenel <- parenel[order(parenel$country, parenel$year),
                   c("country", "year", "survey", "ideology",
                     "ideology_scaled", "ideology_scale_orig", "party", "sex")]
# store for later
write.csv(parenel, "./cleaned-data/parenel.csv", row.names = FALSE)
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 20. PartiRep
# Reads the PartiRep SPSS file, keeps national-level legislators, expands each
# respondent into one row per year of their term and writes the panel out.
partirep <- read_spss("./PARTIREP/PartiRep-Jul14.sav")
partirep <- as_factor(partirep)
partirep <- as.data.frame(partirep)
partirep <- partirep[, c("Country", "Start_term", "End_term",
                         "Parliament", "V037_1", "Party", "Sex")]
colnames(partirep) <- c("country", "leg_begin", "leg_end", "parl", "ideology",
                        "party", "sex")
partirep$country <- as.character(partirep$country)
partirep$parl <- as.character(partirep$parl)
partirep$ideology <- as.character(partirep$ideology)
# 00 indicates a national legislature, not a sub-national one
partirep$level <- str_extract(partirep$parl, "\\d{2}")
partirep <- partirep[which(partirep$level == "00"), ]
partirep <- partirep[, c("country", "leg_begin", "leg_end", "ideology",
                         "party", "sex")]
partirep$survey <- rep("PartiRep", nrow(partirep))
# clean sex
partirep$sex <- as.character(partirep$sex)
partirep$sex[which(partirep$sex == "male")] <- "Male"
partirep$sex[which(partirep$sex == "female")] <- "Female"
# clean party
partirep$party <- str_trim(partirep$party, "both")
# expand partirep into time-series
# Build each respondent's run of years once, then replicate the rows in a
# single subsetting step instead of growing a data frame with rbind() inside
# a loop (which re-copies the accumulated rows on every iteration).
# NOTE(review): assumes Start_term/End_term are numeric years -- confirm
term.years <- lapply(seq_len(nrow(partirep)), function(i) {
  partirep$leg_begin[i]:partirep$leg_end[i]
})
partirep <- partirep[rep(seq_len(nrow(partirep)), times = lengths(term.years)),
                     c("country", "ideology", "survey", "party", "sex")]
partirep$year <- as.integer(unlist(term.years, use.names = FALSE))
# clean variables and save
partirep$country <- gsub("^\\s+|\\s+$", "", partirep$country) # no whitespace
# three-letter codes used in the file -> country names used in the panel
country.names <- c("AUT" = "Austria", "BEL" = "Belgium", "FRA" = "France",
                   "GER" = "Germany", "HUN" = "Hungary", "IRE" = "Ireland",
                   "ISR" = "Israel", "ITA" = "Italy", "NET" = "Netherlands",
                   "NOR" = "Norway", "POL" = "Poland", "POR" = "Portugal",
                   "SPA" = "Spain", "SWI" = "Switzerland",
                   "UNK" = "United Kingdom")
coded <- partirep$country %in% names(country.names)
partirep$country[coded] <- unname(country.names[partirep$country[coded]])
# fix ideology (only the two endpoints carry text labels)
partirep$ideology[which(partirep$ideology == "10: Right")] <- "10"
partirep$ideology[which(partirep$ideology == "0: Left")] <- "0"
partirep$ideology <- as.numeric(partirep$ideology)
partirep$ideology_scaled <- rescalr(partirep$ideology, 0, 10, -1, 1)
partirep$ideology_scale_orig <- rep("0-10", nrow(partirep))
# clean up
partirep <- partirep[order(partirep$country, partirep$year),
                     c("country", "year", "survey", "ideology",
                       "ideology_scaled", "ideology_scale_orig",
                       "party", "sex")]
# store for later
write.csv(partirep, "./cleaned-data/partirep.csv", row.names = FALSE)
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

#### 21.-107. PELA/USAL
# Reads every PELA SPSS file, keeps the five harmonisable questions from each
# and stacks them into `pela` with a `survey` column holding the file path.
files <- list.files(path = "./PELA", pattern = "\\.sav$|\\.SAV$")
files <- paste("./PELA/",files,sep="")
# haven 1.1.1 doesn't allocate enough memory to lapply these files
data.list <- vector("list", length(files))
for(i in seq_along(data.list)){
  # Convert completely before storing: a warning or error raised part-way
  # through must leave the slot NULL so the read.spss() fallback below picks
  # the file up, rather than leaving a half-converted object behind.
  converted <- tryCatch({
    as.data.frame(as_factor(read_sav(files[i])))
  },
  error=function(cond){
    message(paste("ERROR IN DATASET ",i,sep=""))
    message(paste(cond,"\n",sep=""))
    NULL
  },
  warning=function(cond){
    message(paste("WARNING IN DATASET ",i,sep=""))
    message(paste(cond,"\n",sep=""))
    NULL
  })
  if(!is.null(converted)){
    data.list[[i]] <- converted
  }
}
# anything haven could not read cleanly is re-read with foreign::read.spss
probs <- which(unlist(lapply(data.list, is.null)))
for(i in probs){
  data.list[[i]] <- read.spss(files[i], use.value.labels = TRUE,
                              use.missings = TRUE, to.data.frame = TRUE)
}
# we need to select the relevant questions from each dataset...
# we'll do them individually since they're non-standard
# (columns are always picked in the order country, legislature, ideology,
# party, sex; the indices are positions in the sorted file list)
for(i in c(1:3,5:11,13:14,16:18,20,22:23,25:31)){
  data.list[[i]] <- data.list[[i]][,c("nestu","legis","p67","partido","p71")]
}
for(i in c(4,12,15,19,21,24)){
  data.list[[i]] <- data.list[[i]][,c("NESTU","LEGIS","P67","PARTIDO","P71")]
}
for(i in c(32:49)){
  data.list[[i]] <- data.list[[i]][,c("pais","legis","p58","partido","p62")]
}
for(i in c(50:56,58:59,61:63,66)){
  data.list[[i]] <- data.list[[i]][,c("Pais","legis","p64","partido","p67")]
}
for(i in c(57,60)){
  data.list[[i]] <- data.list[[i]][,c("pais","legis","p64","partido","p67")]
}
for(i in c(67:68,72)){
  data.list[[i]] <- data.list[[i]][,c("Pais","legis","ID1","partido","SOCD4")]
}
for(i in c(70:71,73:78,80:81,83:87)){
  data.list[[i]] <- data.list[[i]][,c("pais","legis","ID1","partido","SOCD4")]
}
data.list[[64]] <- data.list[[64]][,c("Pais","legis","id1","partido","socd4")]
data.list[[65]] <- data.list[[65]][,c("pais","legis","id1","partido","socd4")]
data.list[[69]] <- data.list[[69]][,c("País","Per.leg","ID1","Partido","SOCD4")]
data.list[[79]] <- data.list[[79]][,c("pais","legis","ID1","PP","SOCD4")]
data.list[[82]] <- data.list[[82]][,c("Pais","Legis","ID1","Partido","SOCD4")]
# standardize colnames and merge
# every column becomes character, the source file path is appended as
# `survey`, and the pieces are stacked in one rbind() call instead of
# re-copying the growing result once per file
pela.cols <- c("country","leg.session","ideology","party","sex","survey")
data.list <- lapply(seq_along(data.list), function(k){
  d <- data.list[[k]]
  d[] <- lapply(d, as.character)
  d$survey <- rep(files[k], nrow(d))
  colnames(d) <- pela.cols
  d
})
pela <- do.call(rbind, data.list)
# clear up memory space
rm(list = intersect(ls(), c("data.list","files","i","converted","pela.cols")))
# fix country names
# Normalises pela$country to English names in three passes: (1) exact matches
# on byte-escaped spellings, (2) stripping digits and one trailing space,
# (3) exact matches on the remaining spellings. Order matters: pass 1 matches
# "M\x82xico01" with its digits still attached, pass 3 relies on them being
# removed.
# NOTE(review): the \x.. escapes and the "M‚xico" literal look like the same
# accented names read under different single-byte encodings -- confirm which
# files produce which spelling
pela$country[pela$country == "M\x82xico01"] <- "Mexico"
pela$country[pela$country == "M\xe9xico"] <- "Mexico"
pela$country[pela$country == "Panam\xe1"] <- "Panama"
pela$country[pela$country == "Per\xfa"] <- "Peru"
pela$country[pela$country == "Rep\xfablica Dominicana"] <- "Dominican Republic"
# drop all digits, then a single trailing space
pela$country <- gsub('[0-9]+', '', pela$country)
pela$country <- gsub(" $","", pela$country, perl=T)
pela$country[pela$country == "chile"] <- "Chile"
pela$country[pela$country == "SALV"] <- "El Salvador"
pela$country[pela$country == "CR"] <- "Costa Rica"
pela$country[pela$country == "ECU"] <- "Ecuador"
pela$country[pela$country == "Brasil"] <- "Brazil"
pela$country[pela$country == "Panamá"] <- "Panama"
pela$country[pela$country == "México"] <- "Mexico"
pela$country[pela$country == "M‚xico"] <- "Mexico"
pela$country[pela$country == "Perú"] <- "Peru"
pela$country[pela$country == "República Dominicana"] <- "Dominican Republic"
pela$country[pela$country == "REPUBLICA DOMINICANA"] <- "Dominican Republic"
# a few special cases
# for these files the country is set from the file path in `survey`,
# overriding whatever the country column contained
pela$country[pela$survey == "./PELA/da02mex.sav"] <- "Mexico"
pela$country[pela$survey == "./PELA/da03chi.sav"] <- "Chile"
pela$country[pela$survey == "./PELA/da06arg.sav"] <- "Argentina"
pela$country[pela$survey == "./PELA/da12hon.sav"] <- "Honduras"
pela$country[pela$survey == "./PELA/da27sal.sav"] <- "El Salvador"
pela$country[pela$survey == "./PELA/da34uru.sav"] <- "Uruguay"
pela$country[pela$survey == "./PELA/da35ven.sav"] <- "Venezuela"
pela$country[pela$survey == "./PELA/da36ven.sav"] <- "Venezuela"
pela$country[pela$survey == "./PELA/da39nic.sav"] <- "Nicaragua"
pela$country[pela$survey == "./PELA/da49par.sav"] <- "Paraguay"
pela$country[pela$survey == "./PELA/da53pan.sav"] <- "Panama"
# fix messed up formatting of legislative sessions
# raw code as found in the files -> "YYYY-YYYY"
session.fix <- c(
  "99-04" = "1999-2004", "00-03" = "2000-2003", "00-04" = "2000-2004",
  "02-06" = "2002-2006", "03-06" = "2003-2006", "03-07" = "2003-2007",
  "04-08" = "2004-2008", "04-09" = "2004-2009", "05-09" = "2005-2009",
  "05-10" = "2005-2010", "06-09" = "2006-2009", "06-10" = "2006-2010",
  "06-11" = "2006-2011", "07-08" = "2007-2008", "07-09" = "2007-2009",
  "07-10" = "2007-2010", "07-11" = "2007-2011", "08-12" = "2008-2012",
  "08-13" = "2008-2013", "09-11" = "2009-2011", "09-12" = "2009-2012",
  "09-13" = "2009-2013", "09-14" = "2009-2014", "10-14" = "2010-2014",
  "13-17" = "2013-2017",
  "9397" = "1993-1997", "9398" = "1993-1998", "9497" = "1994-1997",
  "9498" = "1994-1998", "9597" = "1995-1997", "9599" = "1995-1999",
  "9500" = "1995-2000", "9698" = "1996-1998", "9601" = "1996-2001",
  "9700" = "1997-2000", "9701" = "1997-2001", "9702" = "1997-2002",
  "9802" = "1998-2002", "9803" = "1998-2003",
  "913" = "2009-2013", "711" = "2007-2011",
  "1217" = "2012-2017", "1318" = "2013-2018",
  "2011" = "2011-2016"
)
hit <- match(pela$leg.session, names(session.fix))
found <- !is.na(hit)
pela$leg.session[found] <- unname(session.fix)[hit[found]]
# some unique cases that are either miscoded, mis-read, etc.
# (keyed by source file; these override whatever the file itself says)
survey.fix <- c(
  "./PELA/da33uru.sav" = "2000-2005",
  "./PELA/da35ven.sav" = "2000-2005",
  "./PELA/da27sal.sav" = "2000-2003",
  "./PELA/da31per.sav" = "2001-2006",
  "./PELA/da47bol.sav" = "2002-2005",
  "./PELA/da75bra.sav" = "2006-2010",
  # elections held early, so not 2002-2007
  "./PELA/da49par.sav" = "2003-2008",
  "./PELA/da62bol.sav" = "2006-2009",
  # elections later held early, so not 2006-2010
  "./PELA/da67arg.sav" = "2007-2011",
  # miscode somehow
  "./PELA/da76uru.sav" = "2010-2015",
  "./PELA/da85gua.sav" = "2012-2016"
)
hit <- match(pela$survey, names(survey.fix))
found <- !is.na(hit)
pela$leg.session[found] <- unname(survey.fix)[hit[found]]
rm(session.fix, survey.fix, hit, found)
# create year indicators: first and last year of the "YYYY-YYYY" string
pela$leg.begin <- as.numeric(str_sub(pela$leg.session, 0, 4))
pela$leg.end <- as.numeric(str_sub(pela$leg.session, 6, 9))
# fix ideology
# endpoint labels (Spanish and Portuguese) become "1" and "10"
left.labels <- c("Izquierda (1)", "Izquierda", "Esquerda")
right.labels <- c("Derecha (10)", "Derecha", "Direita")
pela$ideology[pela$ideology %in% left.labels] <- "1"
# interior points appear as "(2)" ... "(9)"
for(k in 2:9){
  pela$ideology[which(pela$ideology == paste0("(", k, ")"))] <- as.character(k)
}
pela$ideology[pela$ideology %in% right.labels] <- "10"
# codes and labels that are set to missing
pela$ideology[pela$ideology %in% c("99", "N.C.", "N.S.", "N.R.", "98")] <- NA
rm(left.labels, right.labels, k)
# clean sex
pela$sex[pela$sex %in% c("1", "Hombre", "Homem")] <- "Male"
pela$sex[pela$sex %in% c("2", "Mujer", "Mulher")] <- "Female"
pela$sex[pela$sex %in% "4"] <- NA # no idea
# expand pela into time-series
# Every respondent becomes one row per year of leg.begin..leg.end. A session
# that was not parsed into two years cannot be expanded, so fail with the
# offending files named instead of an opaque "NA/NaN argument" error.
unparsed <- is.na(pela$leg.begin) | is.na(pela$leg.end)
if(any(unparsed)){
  stop("PELA rows without a usable legislative session in: ",
       paste(unique(pela$survey[unparsed]), collapse = ", "))
}
# Build each respondent's run of years once, then replicate the rows in a
# single subsetting step instead of growing a data frame with rbind() inside
# a loop (which re-copies the accumulated rows on every iteration).
term.years <- lapply(seq_len(nrow(pela)), function(i) {
  pela$leg.begin[i]:pela$leg.end[i]
})
pela <- pela[rep(seq_len(nrow(pela)), times = lengths(term.years)),
             c("country", "ideology", "survey", "party", "sex")]
pela$year <- as.integer(unlist(term.years, use.names = FALSE))
# fix survey: replace the file path by "PELA study NN", NN being the first
# two-digit run in the path
pela$survey <- paste("PELA study", str_extract(pela$survey, "\\d{2}"),
                     sep = " ")
# create scaled variable
pela$ideology <- as.numeric(pela$ideology)
pela$ideology_scaled <- rescalr(pela$ideology, 1, 10, -1, 1)
pela$ideology_scale_orig <- rep("1-10", nrow(pela))
# fix country name
pela$country[which(pela$country == "Venezuela")] <- "Venezuela, RB"
# NA out one of the parties which is unknown
pela$party[which(pela$party == "13")] <- NA
# clean up
pela <- pela[order(pela$country, pela$year),
             c("country", "year", "survey", "ideology", "ideology_scaled",
               "ideology_scale_orig", "party", "sex")]
# store for later
write.csv(pela, "./cleaned-data/pela.csv", row.names = FALSE)
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 108. Ukraine (Wilson and Birch 1999) - elected deputies
# Reads the deputies file, maps the five ideology labels to 1-5, repeats every
# deputy for each year 1998-2002 and writes the panel out.
birch <- read.dta("./Birch/stata8/h079d.dta")
birch <- birch[, c("p1", "p32")]
colnames(birch) <- c("country", "ideology")
birch$country <- rep("Ukraine", nrow(birch))
# ideology: text label -> position on the 1-5 scale
birch$ideology <- as.character(birch$ideology)
ideology.codes <- c("leftist" = "1",
                    "centre-left" = "2",
                    "centrist" = "3",
                    "national-democratic" = "4",
                    "(right wing) ukrainian-nationalis" = "5")
labelled <- birch$ideology %in% names(ideology.codes)
birch$ideology[labelled] <- unname(ideology.codes[birch$ideology[labelled]])
birch$ideology[birch$ideology %in% c("d/k", "none of the above")] <- NA
# make a panel -- everyone elected 1998-2002
# (all deputies for 1998 first, then all for 1999, and so on)
byear <- 1998:2002
n.deputies <- nrow(birch)
birch <- birch[rep(seq_len(n.deputies), times = length(byear)), ]
birch$year <- rep(byear, each = n.deputies)
# scale and label
birch$ideology <- as.numeric(birch$ideology)
birch$ideology_scaled <- rescalr(birch$ideology, 1, 5, -1, 1)
birch$ideology_scale_orig <- rep("1-5", nrow(birch))
birch$survey <- rep("Birch 1999", nrow(birch))
# no party or sex question
birch$party <- rep(NA, nrow(birch))
birch$sex <- rep(NA, nrow(birch))
# clean up
out.cols <- c("country", "year", "survey", "ideology", "ideology_scaled",
              "ideology_scale_orig", "party", "sex")
birch <- birch[order(birch$country, birch$year), out.cols]
# store for later
write.csv(birch, "./cleaned-data/birch-elite.csv", row.names = F)
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df", "qcut", "rescalr",
                          "simpleCap", "df.final")))

### Sweden Parliamentary Survey
# Reads the Swedish parliamentary survey file, codes each wave's legislative
# term, expands respondents into one row per year of the term and writes the
# panel out.
# read in and clean
swel <- read.spss("./Sweden RDU surveys/Super data set for Noam Lupu 2018.sav",
                  use.value.labels = TRUE, use.missings = TRUE,
                  to.data.frame = TRUE)
# NOTE(review): names are assigned by position, so this assumes the file has
# exactly these 14 columns in this order -- confirm against the data
names(swel) <- c("public_sector", "defense_spending", "health_private",
                 "pornography", "working_day", "reduce_inequality",
                 "ban_driving", "fewer_refugees", "nato", "ideology",
                 "occupation", "sex", "party", "year")
swel$country <- rep("Sweden", nrow(swel))
swel <- swel[, c("country", "year", "ideology", "party", "sex")]
swel[] <- lapply(swel, as.character)
# clean ideology: keep the digits of each label only
swel$ideology <- gsub("[^\\d]+", "", swel$ideology, perl = TRUE)
swel$ideology <- as.numeric(swel$ideology)
# clean gender
swel$sex[which(swel$sex == "Man")] <- "Male"
swel$sex[which(swel$sex == "Woman")] <- "Female"
# drop 1991 since all blank
# (a logical mask is used because x[-which(cond), ] returns zero rows when
# nothing matches; rows with a missing year are kept, as before)
swel <- swel[!(swel$year %in% "1991"), ]
# code legislative terms
swel$leg.begin <- swel$year
swel$leg.end <- rep(NA, nrow(swel))
swel$leg.end[which(swel$leg.begin == "1985")] <- "1988"
swel$leg.end[which(swel$leg.begin == "1988")] <- "1991"
swel$leg.end[which(swel$leg.begin == "1994")] <- "1996"
swel$leg.end[which(swel$leg.begin == "1996")] <- "1998"
swel$leg.end[which(swel$leg.begin == "1998")] <- "2002"
swel$leg.end[which(swel$leg.begin == "2002")] <- "2006"
swel$leg.end[which(swel$leg.begin == "2006")] <- "2010"
swel$leg.end[which(swel$leg.begin == "2010")] <- "2014"
swel$leg.end[which(swel$leg.begin == "2014")] <- "2017"
# code survey waves
swel$survey <- rep(NA, nrow(swel))
swel$survey[which(swel$leg.begin == "1985")] <- "Sweden RDU wave 1"
swel$survey[which(swel$leg.begin == "1988")] <- "Sweden RDU wave 2"
swel$survey[which(swel$leg.begin == "1994")] <- "Sweden RDU wave 4"
swel$survey[which(swel$leg.begin == "1996")] <- "Sweden RDU wave 5"
swel$survey[which(swel$leg.begin == "1998")] <- "Sweden RDU wave 6"
swel$survey[which(swel$leg.begin == "2002")] <- "Sweden RDU wave 7"
swel$survey[which(swel$leg.begin == "2006")] <- "Sweden RDU wave 8"
swel$survey[which(swel$leg.begin == "2010")] <- "Sweden RDU wave 9"
swel$survey[which(swel$leg.begin == "2014")] <- "Sweden RDU wave 10"
# create panel
# Build each respondent's run of years once, then replicate the rows in a
# single subsetting step instead of growing a data frame with rbind() inside
# a loop (which re-copies the accumulated rows on every iteration).
term.years <- lapply(seq_len(nrow(swel)), function(i) {
  swel$leg.begin[i]:swel$leg.end[i]
})
swel <- swel[rep(seq_len(nrow(swel)), times = lengths(term.years)), ]
swel$year <- as.character(unlist(term.years, use.names = FALSE))
swel$leg.begin <- swel$leg.end <- NULL
# rescale ideology
swel$ideology_scaled <- rescalr(swel$ideology, 0, 10, -1, 1)
swel$ideology_scale_orig <- rep("0-10", nrow(swel))
# clean up
swel <- swel[order(swel$country, swel$year),
             c("country", "year", "survey", "ideology", "ideology_scaled",
               "ideology_scale_orig", "party", "sex")]
# store for later
write.csv(swel, "./cleaned-data/swel.csv", row.names = FALSE)
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### Switzerland Elites
# Reads the Swiss elite file, derives each respondent's term from the two
# membership flags (mp71, mp75), expands respondents into one row per year of
# the term and writes the panel out.
swiss <- read_dta("./Switzerland elite surveys/kerr_allvar_withID.dta")
swiss <- as_factor(swiss)
swiss <- as.data.frame(swiss)
swiss <- swiss[, c("v2", "v353", "v374", "v620", "v689")]
names(swiss) <- c("party", "sex", "ideology", "mp71", "mp75")
swiss$country <- rep("Switzerland", nrow(swiss))
swiss$survey <- rep("Iowa CLRC", nrow(swiss))
# clean ideology: the listed non-answers become NA before conversion
swiss$ideology <- as.character(swiss$ideology)
swiss$ideology[which(swiss$ideology %in% c("IT DEPENDS", "DK",
                                           "REJECTS EXTREMES",
                                           "REFUSES ANSWER"))] <- NA
swiss$ideology <- as.numeric(swiss$ideology)
# clean party: anything outside the four listed parties is pooled as "Other"
swiss$party <- as.character(swiss$party)
swiss$party[which(!(swiss$party %in% c("RADICAL DEM", "SOCIALIST",
                                       "CHRISTIAN DEM",
                                       "SWISS POPULIST")))] <- "Other"
# clean sex
swiss$sex <- as.character(swiss$sex)
swiss$sex[which(swiss$sex == "MALE")] <- "Male"
swiss$sex[which(swiss$sex == "FEMALE")] <- "Female"
# make legislative terms
# default term is 1971-1975; the start moves to 1974 unless mp71 is "YES",
# the end moves to 1979 unless mp75 is "NO"
swiss$leg.begin <- rep("1971", nrow(swiss))
swiss$leg.end <- rep("1975", nrow(swiss))
swiss$leg.begin <- ifelse(as.character(swiss$mp71) == "YES",
                          swiss$leg.begin,
                          "1974")
swiss$leg.end <- ifelse(as.character(swiss$mp75) == "NO",
                        swiss$leg.end,
                        "1979")
swiss$mp71 <- swiss$mp75 <- NULL
swiss$leg.begin <- as.numeric(swiss$leg.begin)
swiss$leg.end <- as.numeric(swiss$leg.end)
# create panel
# Build each respondent's run of years once, then replicate the rows in a
# single subsetting step instead of growing a data frame with rbind() inside
# a loop (which re-copies the accumulated rows on every iteration).
term.years <- lapply(seq_len(nrow(swiss)), function(i) {
  swiss$leg.begin[i]:swiss$leg.end[i]
})
swiss <- swiss[rep(seq_len(nrow(swiss)), times = lengths(term.years)), ]
swiss$year <- as.character(unlist(term.years, use.names = FALSE))
swiss$leg.begin <- swiss$leg.end <- NULL
# rescale ideology (already numeric at this point)
swiss$ideology_scaled <- rescalr(swiss$ideology, 1, 21, -1, 1)
swiss$ideology_scale_orig <- rep("1-21", nrow(swiss))
# clean up
swiss <- swiss[order(swiss$country, swiss$year),
               c("country", "year", "survey", "ideology", "ideology_scaled",
                 "ideology_scale_orig", "party", "sex")]
# store for later
write.csv(swiss, "./cleaned-data/swiss-elite.csv", row.names = FALSE)
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

##### MASS #####

### 1. CSES
# Reads the four CSES module files, keeps the same eleven questions from each
# (in a fixed order), tags the module number and stacks them into `cses`.
files <- list.files(path = "./CSES", pattern = "\\.dta$", recursive = T)
files <- paste("./CSES/",files,sep="")
data.list <- lapply(files, function(x) read.dta(x))
# set up merged data placeholder
cses <- data.frame(matrix(nrow=0,ncol=12))
colnames(cses) <- c("country","year","ideology","gender","income","education",
                    "occupation","info1","info2","info3","info4","wave")
# we need to select the relevant questions from each dataset...
# we'll do them individually since they're non-standard
# note info4 is going to get NA-ed out for waves 1-3, so put a placeholder there
# (the last entry for waves 1-3 is that placeholder column)
wave.cols <- list(
  c("A1006","A1004","A3031","A2002","A2012",
    "A2003","A2008","A2023","A2024","A2025",
    "A1003"),
  c("B1006","B1004","B3045","B2002","B2020",
    "B2003","B2011","B3047_1","B3047_2",
    "B3047_3","B1003"),
  c("C1006","C1004","C3013","C2002","C2020",
    "C2003","C2011","C3036_1","C3036_2",
    "C3036_3","C1003"),
  c("D1006","D1004","D3014","D2002","D2020",
    "D2003","D2011","D3025_1_A","D3025_2_A",
    "D3025_3_A","D3025_4_A")
)
for(w in seq_along(wave.cols)){
  data.list[[w]] <- data.list[[w]][, wave.cols[[w]]]
  data.list[[w]]$wave <- rep(as.character(w), nrow(data.list[[w]]))
}
rm(wave.cols, w)
# give every piece the common column names and turn every column into
# character, to make sure factors don't carry over
data.list <- lapply(data.list, function(d){
  colnames(d) <- colnames(cses)
  d[] <- lapply(d, as.character)
  d
})
cses <- do.call(rbind, data.list)
# clean gender
# anything containing a 7 or 9 is set to missing; FEMALE is tested before
# MALE because the upper-case string "FEMALE" also contains "MALE"
cses$gender[grepl("7|9", cses$gender)] <- NA
cses$gender[grepl("FEMALE", cses$gender)] <- "Female"
cses$gender[grepl("MALE", cses$gender)] <- "Male"
# clean education: keep the first run of digits, values above 8 are missing
education.code <- gsub("\\D*(\\d+).*", "\\1", cses$education)
cses$education <- as.numeric(education.code)
rm(education.code)
cses$education[which(cses$education > 8)] <- NA
# fix countries -- no way around this mess
# target name -> every raw value (bare code or "code. LABEL") mapped to it;
# entries are applied in the order listed
country.codes <- list(
  "Albania" = "0080",
  "Australia" = c("0360", "0360. AUSTRALIA"),
  "Austria" = "0400",
  "Belgium" = c("0560", "0561. BELGIUM-FLANDERS", "0562. BELGIUM-WALLOON"),
  "Brazil" = "0760",
  "Bulgaria" = "1000",
  "Belarus" = c("1120", "1120. BELARUS"),
  "Canada" = c("1240", "1240. CANADA"),
  "Chile" = c("1502", "1520", "1520. CHILE"),
  "Taiwan" = c("1580", "1580. TAIWAN PROVINCE OF CHINA"),
  "Croatia" = "1910",
  "Czech Republic" = c("2030", "2030. CZECH REPUBLIC"),
  "Denmark" = c("2080", "2080. DENMARK"),
  "Estonia" = "2330",
  "Finland" = "2460",
  "France" = "2500",
  "Germany" = c("2760", "2760. GERMANY", "2761", "2762"),
  "Greece" = "3000",
  "Hong Kong SAR, China" = c("3440",
                             "3440. HONG KONG SPECIAL ADMINISTRATIVE"),
  "Hungary" = c("3480", "3480. HUNGARY"),
  "Iceland" = c("3520", "3520. ICELAND"),
  "Ireland" = "3720",
  "Israel" = c("3760", "3760. ISRAEL"),
  "Italy" = "3800",
  "Japan" = c("3920", "3920. JAPAN"),
  "Korea, Rep." = c("4100", "4100. REPUBLIC OF KOREA"),
  "Kyrgyz Republic" = "4170",
  "Latvia" = "4280",
  "Lithuania" = "4400. LITHUANIA",
  "Mexico" = c("4840", "4840. MEXICO"),
  "Montenegro" = "4990",
  "Netherlands" = c("5280", "5280. NETHERLANDS"),
  "New Zealand" = c("5540", "5542", "5540. NEW ZEALAND"),
  "Norway" = c("5780", "5780. NORWAY"),
  "Peru" = c("6040", "6040. PERU"),
  "Philippines" = "6080",
  "Poland" = c("6160", "6160. POLAND"),
  "Portugal" = c("6200", "6200. PORTUGAL"),
  "Romania" = c("6420", "6420. ROMANIA"),
  "Russian Federation" = c("6430", "6430. RUSSIAN FEDERATION"),
  "Serbia" = "6880",
  "Slovak Republic" = "7030",
  "Slovenia" = c("7050", "7050. SLOVENIA"),
  "South Africa" = "7100",
  "Spain" = c("7240", "7240. SPAIN"),
  "Sweden" = c("7520", "7520. SWEDEN"),
  "Switzerland" = c("7560", "7560. SWITZERLAND"),
  "Thailand" = c("7640", "7640. THAILAND"),
  "Turkey" = "7920",
  "Ukraine" = c("8040", "8040. UKRAINE"),
  "United Kingdom" = c("8260", "8260. UNITED KINGDOM: GREAT BRITAIN"),
  "United States" = c("8400", "8400. UNITED STATES", "8402"),
  "Uruguay" = "8580"
)
for(target in names(country.codes)){
  cses$country[which(cses$country %in% country.codes[[target]])] <- target
}
rm(country.codes, target)
# fix ideology: keep only the digits of each response, then blank out anything
# above the top of the 0-10 scale
cses$ideology <- as.numeric(gsub("[^\\d]+", "", cses$ideology, perl = TRUE))
is.na(cses$ideology) <- which(cses$ideology > 10)
# fix year: keep only the digits and convert to a number
cses$year <- as.numeric(gsub("[^\\d]+", "", cses$year, perl = TRUE))
# join 2002 german sample (coded 12002 / 22002 after digit extraction).
# The replacement is the number 2002, not the string "2002": assigning a
# character value into a numeric vector would silently coerce the entire year
# column to character.
cses$year[which(cses$year %in% c(12002, 22002))] <- 2002
# fix income: digits only, codes above 5 are missing, then shift to 0-4
# (so that it's 0 and 4, like other data/quantiles)
cses$income <- as.numeric(gsub("[^\\d]+", "", cses$income, perl = TRUE))
is.na(cses$income) <- which(cses$income > 5)
cses$income <- cses$income - 1
# knowledge/information variables
# info4 is only kept for wave 4; everything else is blanked before recoding
cses$info4[which(cses$wave != "4")] <- NA
# every item gets the same treatment: digits only, codes above 6 become NA,
# and any remaining code above 1 collapses to 0 (so 1 is the only non-zero
# value left)
cses[c("info1", "info2", "info3", "info4")] <- lapply(
  cses[c("info1", "info2", "info3", "info4")],
  function(item) {
    item <- as.numeric(gsub("[^\\d]+", "", item, perl = TRUE))
    item[which(item > 6)] <- NA
    item[which(item > 1)] <- 0
    item
  }
)
### clean occupation
# get the numeric code: strip everything that is not a digit
cses$ocup <- gsub("[^\\d]+", "", cses$occupation, perl=TRUE)
# insert a decimal point after the first two digits (codes of three or more
# digits only) so we can just use logical operators on the two-digit group
cses$ocup <- as.numeric(gsub('^([0-9]{2})([0-9]+)$', '\\1.\\2', cses$ocup))
# missing-type codes first
cses$occupation[which(cses$ocup > 96)] <- NA # other, NA, DK, NR, etc
cses$occupation[which(cses$ocup == 96)] <- NA # non-classifiable
cses$occupation[which(cses$ocup < 1)] <- NA # NA and military
# 96.1 / 96.2 were caught by the "> 96" rule above; re-code them explicitly
cses$occupation[which(cses$ocup == 96.1 |
                        cses$ocup == 96.2)] <- 0 # refuse worker
# substantive classes, from 0 (lowest) to 4 (highest)
cses$occupation[which(cses$ocup > 79 & cses$ocup < 96)] <- 0 # mechanical
cses$occupation[which(cses$ocup > 59 & cses$ocup < 80)] <- 1 # skilled manual
cses$occupation[which(cses$ocup > 49 & cses$ocup < 60)] <- 2 # service jobs
cses$occupation[which(cses$ocup > 29 & cses$ocup < 50)] <- 3 # clerks
# lower bound of 1 keeps the "< 1" codes NA; without it this rule would
# overwrite the NA assigned above and code those rows as class 4
cses$occupation[which(cses$ocup >= 1 &
                        cses$ocup < 30)] <- 4 # white collar professionals
# NOTE(review): rows whose occupation string contains no digits have ocup NA
# and therefore keep their original occupation text -- confirm that is intended
cses$ocup <- NULL
# income is pre-factored, we have no material wealth data, so only factor
# education and knowledge
# split into each country-year; split() returns one element per
# country x year combination, so keep only the combinations that have rows
templist <- Filter(function(cy) nrow(cy) > 0,
                   split(cses, f = list(cses$country, cses$year)))
# factor information: for every country-year, reduce the info* items to a
# single "knowledge" column. With two or more usable items the first MCA
# dimension is cut into quintile labels "0".."4"; with exactly one usable item
# the item itself is used; with none the column is NA.
# `knowledge` holds the positions of all columns whose name contains "info"
knowledge <- which(grepl("info", colnames(cses)))
for(i in 1:length(templist)){
  print(i)
  # get the temporary dataframe: row names serve as the merge key
  templist[[i]]$id <- rownames(templist[[i]])
  knowmat <- cbind(templist[[i]]$id,templist[[i]][,knowledge])
  colnames(knowmat)[1] <- "id"
  # remove all NA columns (walk backwards so deletions don't shift the index)
  for(j in ncol(knowmat):2){       # 2 since column 1 is IDs
    if(all(is.na(unique(knowmat[,j]))) == T){
      knowmat <- knowmat[,-j]
    }
  }
  # if no knowledge questions were asked, we skip the country-year
  # (dropping the last item column collapses knowmat to a bare vector, which
  # is what the dim() test detects)
  if(is.null(dim(knowmat)) == T){
    templist[[i]]$knowledge <- rep(NA,nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][,c("country","year","ideology","gender",
                                      "occupation","education","income",
                                      "knowledge","wave")]
    next
  }
  # exactly one usable item: nothing to factor, use the item directly.
  # A value of 1 is recoded to "4" (the top label produced by qcut(., 5));
  # other values are left unchanged
  if(dim(knowmat)[2] == 2){
    stopifnot(nrow(templist[[i]]) == nrow(knowmat))
    colnames(knowmat)[2] <- "knowledge"
    knowmat$knowledge[which(knowmat$knowledge == "1")] <- "4"
    templist[[i]]$knowledge <- knowmat$knowledge
    templist[[i]] <- templist[[i]][,c("country","year","ideology","gender",
                                      "occupation","education","income",
                                      "knowledge","wave")]
    next
  }
  # convert each item to a two-level factor ("0"/"1") as input for MCA
  for(j in 2:ncol(knowmat)){
    knowmat[,j] <- factor(as.character(knowmat[,j]), levels=c("0","1"))
  }
  # take only complete cases and factor
  knowmat <- knowmat[complete.cases(knowmat),]
  facs <- MCA(knowmat[,-1], ncp = 5, graph = F)
  # save first factor score
  knowmat$knowledge <- facs$ind$coord[,1]
  # need to make sure scales are in the same direction -- make numeric, sort by
  # sum of 1's, then flip axes as necessary
  for(j in 2:(ncol(knowmat)-1)){
    knowmat[,j] <- as.numeric(as.character(knowmat[,j]))
  }
  knowmat <-knowmat[order(rowSums(knowmat[,2:(ncol(knowmat)-1)]),decreasing=T),]
  # after sorting, the first row has the most 1's and the last row the fewest;
  # if the first row scores lower than the last, the axis is inverted
  if(as.logical(head(knowmat$knowledge, 1) < tail(knowmat$knowledge, 1)) == T){
    knowmat$knowledge <- knowmat$knowledge*(-1)
  }
  # re-merge and store metadata, matching on ID (rows dropped as incomplete
  # come back with NA knowledge because of the outer join)
  templist[[i]] <- merge(templist[[i]],knowmat, by = "id", all.x = T, all.y = T)
  templist[[i]] <- templist[[i]][,c("country","year","ideology","gender",
                                    "occupation","education","income",
                                    "knowledge","id","wave")]
  # now we need to quantize
  tmpdf <- templist[[i]]
  # jitter to allow quantization (breaks ties so the quantile cut points are
  # distinct); note this consumes random numbers, so results depend on the seed
  tmpdf$knowledge <- jitter(as.numeric(tmpdf$knowledge), factor=10e-8)
  knowqnt <- as.character(qcut(tmpdf$knowledge,5)) # just to get labels
  # make sure factor levels align: the highest label must have a higher mean
  # score than the lowest label
  stopifnot(mean(tmpdf$knowledge[which(knowqnt == max(as.numeric(knowqnt),
                                                      na.rm=T))]) >
              mean(tmpdf$knowledge[which(knowqnt == min(as.numeric(knowqnt),
                                                        na.rm=T))]))
  tmpdf$knowledge <- knowqnt
  tmpdf <- tmpdf[,c("id","knowledge")]
  # swap the continuous score (knowledge.x) for the quintile label (knowledge.y)
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id", all.x = T, all.y = T)
  templist[[i]]$knowledge.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "knowledge.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "knowledge"
  templist[[i]] <- templist[[i]][,c("country","year","ideology","gender",
                                    "occupation","education","income",
                                    "knowledge","wave")]
}
# quantize education now: within each country-year, replace education by its
# quintile label ("0".."4"), or leave it NA when nothing was recorded
for(i in seq_along(templist)){
  print(i)
  # work on a copy keyed by row name
  cy <- templist[[i]]
  cy$id <- rownames(cy)
  # no education data at all: blank the column and move on
  if(all(is.na(cy$education))){
    cy$education <- rep(NA, nrow(cy))
    templist[[i]] <- cy[, c("country","year","ideology","gender",
                            "occupation","education","income",
                            "knowledge","wave")]
    next
  }
  # give everyone an education quantile.
  # ties are broken by random assignment through jitter; this shouldn't matter
  # since it will never separate top and bottom quintiles, which is what we
  # care about
  edu.jit <- jitter(as.numeric(cy$education), factor = 10e-8)
  eduqnt <- as.character(qcut(edu.jit, 5))
  # make sure factor levels align: the top label must have the higher mean
  top.lab <- max(as.numeric(eduqnt), na.rm = TRUE)
  bottom.lab <- min(as.numeric(eduqnt), na.rm = TRUE)
  stopifnot(mean(edu.jit[which(eduqnt == top.lab)]) >
              mean(edu.jit[which(eduqnt == bottom.lab)]))
  # attach the labels by id in place of the raw education column
  tmpdf <- data.frame(id = cy$id, education = eduqnt,
                      stringsAsFactors = FALSE)
  cy <- merge(cy[, setdiff(colnames(cy), "education")], tmpdf, by = "id")
  templist[[i]] <- cy[, c("country","year","ideology","gender",
                          "occupation","education","income",
                          "knowledge","wave")]
}
# recombine the country-years into one data frame
cses <- do.call(rbind, templist)
# clean up: add survey label and rescaled ideology; wealth is unavailable
cses$survey <- paste("CSES Wave", cses$wave)
cses$ideology_scale_orig <- rep("0-10", nrow(cses))
cses$ideology_scaled <- rescalr(cses$ideology, theoryMin = 0, theoryMax = 10,
                                newMin = -1, newMax = 1)
cses$wealth <- rep(NA, nrow(cses))
# sort by country-year, then keep the harmonized columns in their final order
cses <- cses[order(cses$country, cses$year), ]
cses <- cses[, c("country","year","survey","ideology","gender","wealth",
                 "occupation","education","income","knowledge",
                 "ideology_scaled","ideology_scale_orig")]
write.csv(cses, "./cleaned-data/cses.csv", row.names = FALSE)
# drop everything from the workspace except the utilities and df.final
rm(list = setdiff(ls(), c("completeFun","expand.grid.df",
                          "qcut","rescalr","simpleCap","df.final")))

### 5.-74. Eurobarometer (note Mannheim merge is last in this list)
files <- paste0("./Eurobarometer/",
                list.files(path = "./Eurobarometer", pattern = "\\.dta$",
                           recursive = TRUE))
# this takes a while -- 12.5Gb of data (we'll reduce to ~600Mb)
# each file is read, its labelled values turned into factor labels, converted
# to a plain data frame, and every column stored as character
data.list <- lapply(files, function(path) {
  eb <- as.data.frame(as_factor(read_dta(path)))
  eb[] <- lapply(eb, as.character)
  eb
})
# set up merged data placeholder: an empty 29-column frame whose column names
# define the harmonized layout (8 core variables, 11 material-wealth items,
# 10 knowledge items)
eurobarometer <- setNames(
  data.frame(matrix(nrow = 0, ncol = 29)),
  c("country","ideology","gender","income","occupation",
    "survey","year","education",
    "mat_tv","mat_dvd","mat_cd","mat_computer","mat_internet","mat_car",
    "mat_accom_paid","mat_accom_paying",
    "mat_laptop","mat_tablet","mat_smartphone",
    "info1","info2","info3","info4","info5",
    "info6","info7","info8","info9","info10")
)
# we need to select the relevant questions from each dataset...
# we'll do them individually since they're non-standard (boo, hiss)
# files 1-11: variables to keep, listed in the harmonized column order
early.cols <- list(
  c("V12","V695","V701","V728","V440","V4","V3","V699"),
  c("V12","V413","V419","V447","V423","V4","V3",
    "V417","V296","V297","V298","V299","V300"),
  c("V12","V349","V355","V383","V359","V4","V3","V353"),
  c("V10","V496","V502","V517","V514","V15","V3","V501"),
  c("V13","V501","V448","V529","V452","V4","V3","V505"),
  c("V13","V486","V492","V520","V496","V4","V3","V490"),
  c("V12","V515","V521","V549","V525","V4","V3","V519"),
  c("V12","V591","V597","V625","V601","V4","V3","V595"),
  c("V12","V494","V500","V528","V504","V4","V3","V498"),
  c("V12","V324","V329","V352","V37","V4","V3","V327"),
  c("V12","V307","V330","V358","V334","V4","V3","V328",
    "V151","V152","V153","V154","V155","V156","V157","V158","V159","V160")
)
for(i in seq_along(early.cols)){
  data.list[[i]] <- data.list[[i]][, early.cols[[i]]]
}
# V3 sits in the year slot for these files; overwrite it with the survey year
for(i in 1:3){
  data.list[[i]]$V3 <- rep("2002", nrow(data.list[[i]]))
}
for(i in 4:10){
  data.list[[i]]$V3 <- rep("2003", nrow(data.list[[i]]))
}
data.list[[11]]$V3 <- rep("2004", nrow(data.list[[11]]))
# file 4 uses V15 in the survey slot; set its survey label by hand
data.list[[4]]$V15 <- rep("EB59.0", nrow(data.list[[4]]))
# Files 12-40: keep only the needed variables. The ORDER of each vector
# matters: columns are renamed by position further down, following
# colnames(eurobarometer) -- country, ideology, gender, income, occupation,
# survey, year, education, then any extra items. In these files V3 occupies
# the income slot purely as a placeholder; it is overwritten with NA in the
# "NA-ing out" step below.
data.list[[12]] <- data.list[[12]][,c("V6","V422","V428","V3","V432","V4",
                                      "V445","V426","V211","V212","V213","V214",
                                      "V215","V216")]
data.list[[13]] <- data.list[[13]][,c("V6","V579","V585","V3","V61","V4","V595",
                                      "V583","V96","V97","V98","V99","V100",
                                      "V101")]
data.list[[14]] <-data.list[[14]][,c("V6","V464","V470","V3","V64","V4","V493","V468")]
data.list[[15]]<-data.list[[15]][,c("V6","V664","V670","V3","V674","V4","V691","V668")]
data.list[[16]]<-data.list[[16]][,c("V6","V361","V367","V3","V371","V4","V386","V365")]
data.list[[17]] <-data.list[[17]][,c("V6","V330","V336","V3","V36","V4","V353","V334")]
data.list[[18]] <- data.list[[18]][,c("V6","V404","V410","V3","V414","V4","V584",
                                      "V408","V431","V432","V433","V434","V435",
                                      "V436","V437","V438","V173","V174",
                                      "V175","V176","V327","V328","V329","V330",
                                      "V331","V332")]
data.list[[19]]<-data.list[[19]][,c("V6","V118","V124","V3","V128","V4","V143","V122")]
data.list[[20]] <-data.list[[20]][,c("V6","V529","V535","V3","V58","V4","V552","V533")]
data.list[[21]] <- data.list[[21]][,c("V6","V433","V439","V3","V71","V4","V3309",
                                      "V437","V221","V222","V223","V224")]
data.list[[22]] <- data.list[[22]][,c("V6","V1053","V807","V3","V1062","V4","V15",
                                      "V1057")]
# file 22: V15 sits in the year slot and is set to the survey year by hand
data.list[[22]]$V15 <- rep("2005",nrow(data.list[[22]]))
data.list[[23]] <- data.list[[23]][,c("V6","V1918","V1924","V3","V1928","V4","V1941",
                                      "V1922")]
data.list[[24]] <- data.list[[24]][,c("V6","V627","V460","V3","V633","V4","V659",
                                      "V631","V649","V650","V651","V652",
                                      "V653","V654","V655","V656","V204",
                                      "V205","V206")]
data.list[[25]] <- data.list[[25]][,c("V6","V3304","V3310","V3","V3314","V4","V3342",
                                      "V3308","V3332","V3333","V3334","V3335",
                                      "V3336","V3337","V3338","V3339","V3023",
                                      "V3024","V3025")]
data.list[[26]] <- data.list[[26]][,c("V6","V652","V579","V3","V343","V4","V684",
                                      "V656","V674","V675","V676","V677",
                                      "V678","V679","V680","V681")]
data.list[[27]] <- data.list[[27]][,c("V6","V346","V352","V3","V356","V4","V378",
                                      "V350")]
data.list[[28]] <- data.list[[28]][,c("V6","V456","V462","V3","V466","V4","V491",
                                      "V460","V481","V482","V483","V484",
                                      "V485","V486","V487","V488","V143",
                                      "V144","V145")]
data.list[[29]] <- data.list[[29]][,c("V6","V326","V123","V3","V127","V4","V356",
                                      "V330","V346","V347","V348","V349",
                                      "V350","V351","V352","V353")]
data.list[[30]] <- data.list[[30]][,c("V6","V2007","V2012","V3","V97","V4","V2039",
                                      "V2010","V2029","V2030","V2031","V2032",
                                      "V2033","V2034","V2035","V2036")]
data.list[[31]] <- data.list[[31]][,c("V6","V720","V726","V3","V730","V4","V755",
                                      "V724","V745","V746","V747","V748",
                                      "V749","V750","V751","V752")]
data.list[[32]] <- data.list[[32]][,c("V6","V542","V548","V3","V552","V4","V579",
                                      "V546","V569","V570","V571","V572",
                                      "V573","V574","V575","V576","V165",
                                      "V166","V167","V168")]
data.list[[33]] <- data.list[[33]][,c("V6","V577","V583","V3","V77","V4","V610",
                                      "V581","V600","V601","V602","V603",
                                      "V604","V605","V606","V607")]
data.list[[34]] <- data.list[[34]][,c("V6","V414","V420","V3","V424","V4","V3968",
                                      "V418","V441","V442","V443","V444",
                                      "V445","V446","V447","V448","V206",
                                      "V207","V208","V209","V290","V291",
                                      "V292")]
data.list[[35]] <- data.list[[35]][,c("V6","V612","V618","V3","V622","V4","V652",
                                      "V616","V635","V636","V637","V638",
                                      "V639","V640","V641","V642")]
data.list[[36]] <- data.list[[36]][,c("V6","V383","V389","V3","V393","V4","V440",
                                      "V387","V430","V431","V432","V433",
                                      "V434","V435","V436","V437")]
data.list[[37]] <- data.list[[37]][,c("V6","V761","V767","V3","V771","V4","V798",
                                      "V765","V788","V789","V790","V791",
                                      "V792","V793","V794","V795","V359",
                                      "V360","V361","V362")]
data.list[[38]] <- data.list[[38]][,c("V6","V664","V670","V3","V477","V4","V699",
                                      "V668","V689","V690","V691","V692",
                                      "V693","V694","V695","V696","V284",
                                      "V285","V286","V287")]
data.list[[39]] <- data.list[[39]][,c("V6","V638","V644","V3","V648","V4","V675",
                                      "V642","V665","V666","V667","V668",
                                      "V669","V670","V671","V672","V368",
                                      "V369","V370","V371")]
data.list[[40]] <- data.list[[40]][,c("V6","V421","V427","V3","V136","V4","V459",
                                      "V425","V444","V445","V446","V447",
                                      "V448","V449","V450","V451")]
# Files 41-73: same positional selection as above (country, ideology, gender,
# income, occupation, survey, year, education, then extras). Variable naming
# changes across releases: V-codes (42-51), upper-case names (41, 52, 53, 55),
# lower-case names (54, 56-72). VERSION / version / edition are placeholders:
# whichever of them appears in the 4th (income) or 7th (year) position is
# overwritten with NA in the "NA-ing out" step below.
data.list[[41]] <- data.list[[41]][,c("COUNTRY","D1","D10","VERSION","D15A",
                                      "SURVEY","P1","VD8","D46_1","D46_2",
                                      "D46_3","D46_4","D46_5","D46_6",
                                      "D46_7","D46_8")]
data.list[[42]] <- data.list[[42]][,c("V6","V577","V584","V3","V588","V4",
                                      "V606","V582","V266","V267",
                                      "V268","V269","V270")]
data.list[[43]] <- data.list[[43]][,c("V6","V382","V423","V3","V427","V4","V455",
                                      "V421","V440","V441","V442","V443",
                                      "V444","V445","V446","V447","V92","V93")]
data.list[[44]] <- data.list[[44]][,c("V6","V548","V555","V3","V559","V4","V581",
                                      "V553","V311","V312","V313","V314")]
data.list[[45]] <- data.list[[45]][,c("V6","V498","V506","V3","V70","V4","V534",
                                      "V504","V519","V520","V521","V522",
                                      "V523","V524","V525","V526")]
data.list[[46]] <- data.list[[46]][,c("V6","V594","V602","V3","V606","V4",
                                      "V626","V600")]
data.list[[47]] <- data.list[[47]][,c("V6","V488","V496","V3","V94","V4","V523",
                                      "V494","V511","V512","V513","V514",
                                      "V515","V516","V517","V518")]
data.list[[48]] <- data.list[[48]][,c("V6","V514","V103","V3","V525","V4","V545",
                                      "V520","V163","V164","V165","V166",
                                      "V167","V168","V169","V170")]
data.list[[49]] <- data.list[[49]][,c("V6","V154","V82","V3","V165","V4","V198",
                                      "V160","V181","V182","V183","V184",
                                      "V185","V186","V187","V188")]
data.list[[50]] <- data.list[[50]][,c("V6","V593","V601","V3","V605","V4","V627",
                                      "V599","V358","V359","V360","V361",
                                      "V362","V363","V364","V365")]
data.list[[51]] <- data.list[[51]][,c("V6","V607","V615","V3","V619","V4","V651",
                                      "V613","V635","V636","V637","V638",
                                      "V639","V640","V641","V642")]
data.list[[52]] <- data.list[[52]][,c("COUNTRY","D1","D10","VERSION","D15A","SURVEY",
                                      "P1","VD8","D46_1","D46_2","D46_3",
                                      "D46_4","D46_5","D46_6","D46_7","D46_8")]
data.list[[53]] <- data.list[[53]][,c("COUNTRY","D1","D10","VERSION","D15A","SURVEY",
                                      "P1","VD8","D46_1","D46_2","D46_3",
                                      "D46_4","D46_5","D46_6","D46_7","D46_8")]
data.list[[54]] <- data.list[[54]][,c("country","d1","d10","version","d15a","survey",
                                      "p1","vd8","d46_1","d46_2","d46_3",
                                      "d46_4","d46_5","d46_6","d46_7","d46_8",
                                      "qp5_1","qp5_2","qp5_3","qp5_4")]
data.list[[55]] <- data.list[[55]][,c("COUNTRY","D1","D10","VERSION","D15A","SURVEY",
                                      "P1","D8","D46_1","D46_2","D46_3","D46_4",
                                      "D46_5","D46_6","D46_7","D46_8","QP7_1",
                                      "QP7_2","QP7_3","QP7_4")]
data.list[[56]] <- data.list[[56]][,c("country","d1","d10","version","d15a",
                                      "survey","p1","d8")]
# from here on some files list d46_8..d46_11 before d46_5..d46_7; the order is
# deliberate, since these columns are later renamed by position to the mat_*
# names of colnames(eurobarometer)
data.list[[57]] <- data.list[[57]][,c("country","d1","d10","edition","d15a","survey",
                                      "version","d8","d46_1","d46_2","d46_3",
                                      "d46_4","d46_8","d46_9","d46_10","d46_11",
                                      "d46_5","d46_6","d46_7",
                                      "qa16_1","qa16_2","qa16_3")]
data.list[[58]] <- data.list[[58]][,c("country","d1","d10","edition","d15a","survey",
                                      "version","d8","d46_1","d46_2","d46_3",
                                      "d46_4","d46_5","d46_6","d46_7","d46_8")]
data.list[[59]] <- data.list[[59]][,c("country","d1","d10","version","d15a","survey",
                                      "p1","d8","d46_1","d46_2","d46_3","d46_4",
                                      "d46_8","d46_9","d46_10","d46_11","d46_5",
                                      "d46_6","d46_7")]
data.list[[60]] <- data.list[[60]][,c("country","d1","d10","version","d15a","survey",
                                      "p1","d8","d46_1","d46_2","d46_3","d46_4",
                                      "d46_8","d46_9","d46_10","d46_11","d46_5",
                                      "d46_6","d46_7","qa18_1","qa18_2",
                                      "qa18_3")]
data.list[[61]] <- data.list[[61]][,c("country","d1","d10","version","d15a",
                                      "survey","p1","d8","qp4_1","qp4_2",
                                      "qp4_3","qp4_4")]
data.list[[62]] <- data.list[[62]][,c("country","d1","d10","version","d15a","survey",
                                      "p1","d8","d46_1","d46_2","d46_3","d46_4",
                                      "d46_8","d46_9","d46_10","d46_11","d46_5",
                                      "d46_6","d46_7")]
data.list[[63]] <- data.list[[63]][,c("country","d1","d10","edition","d15a","survey",
                                      "version","d8")]
data.list[[64]] <- data.list[[64]][,c("country","d1","d10","edition","d15a","survey",
                                      "version","d8","d46_1","d46_2","d46_3",
                                      "d46_4","d46_8","d46_9","d46_10","d46_11",
                                      "d46_5","d46_6","d46_7","qa17_1","qa17_2",
                                      "qa17_3")]
data.list[[65]] <- data.list[[65]][,c("country","d1","d10","version","d15a","survey",
                                      "p1","d8","d46_1","d46_2","d46_3","d46_4",
                                      "d46_8","d46_9","d46_10","d46_11","d46_5",
                                      "d46_6","d46_7")]
data.list[[66]] <- data.list[[66]][,c("country","d1","d10","edition","d15a","survey",
                                      "version","d8","d46_1","d46_2","d46_3",
                                      "d46_4","d46_8","d46_9","d46_10","d46_11",
                                      "d46_5","d46_6","d46_7","qp11_1","qp11_2",
                                      "qp11_3","qp11_4")]
data.list[[67]] <- data.list[[67]][,c("country","d1","d10","version","d15a","survey",
                                      "p1","d8","d46_1","d46_2","d46_3","d46_4",
                                      "d46_8","d46_9","d46_10","d46_11","d46_5",
                                      "d46_6","d46_7","qb2_1","qb2_2","qb2_3",
                                      "qb2_4")]
data.list[[68]] <- data.list[[68]][,c("country","d1","d10","edition","d15a","survey",
                                      "version","d8","d46_1","d46_2","d46_3",
                                      "d46_4","d46_8","d46_9","d46_10","d46_11",
                                      "d46_5","d46_6","d46_7","qa14_1","qa14_2",
                                      "qa14_3")]
data.list[[69]] <- data.list[[69]][,c("country","d1","d10","edition","d15a","survey",
                                      "version","d8","d46_1","d46_2","d46_3",
                                      "d46_4","d46_8","d46_9","d46_10","d46_11",
                                      "d46_5","d46_6","d46_7")]
data.list[[70]] <- data.list[[70]][,c("country","d1","d10","edition","d15a","survey",
                                      "version","d8","d46_1","d46_2","d46_3",
                                      "d46_4","d46_8","d46_9","d46_10","d46_11",
                                      "d46_5","d46_6","d46_7")]
data.list[[71]] <- data.list[[71]][,c("country","d1","d10","edition","d15a","survey",
                                      "version","d8","d46_1","d46_2","d46_3",
                                      "d46_4","d46_8","d46_9","d46_10","d46_11",
                                      "d46_5","d46_6","d46_7")]
data.list[[72]] <- data.list[[72]][,c("country","d1","d10","edition","d15a","survey",
                                      "version","d8","d46_1","d46_2","d46_3",
                                      "d46_4","d46_8","d46_9","d46_10","d46_11",
                                      "d46_5","d46_6","d46_7","qa16_1","qa16_2",
                                      "qa16_3")]
# file 73 has its own naming scheme (presumably the Mannheim merge mentioned
# in the section header -- confirm) and supplies only the 8 core variables
data.list[[73]] <- data.list[[73]][,c("NATION1","LRS","SEX","INCOME","OCCUP","EB",
                                      "YEAR","EDUC")]
## NA-ing out HH income and year info when not available
# placeholder column name -> indices of the files in which it must be blanked
eb.blank <- list(V3      = c(12:40,42:51),
                 VERSION = c(41,52:53,55),
                 version = c(54,56:72),
                 edition = c(57:58,63:64,66,68:72))
for(nm in names(eb.blank)){
  for(i in eb.blank[[nm]]){
    data.list[[i]][[nm]] <- rep(NA, nrow(data.list[[i]]))
  }
}
# we need to adjust for when we have political knowledge questions
knowledge <- c(2,11,12,13,18,21,24,25,28,32,34,37,38,39,42,43,44,54,55,57,60,61,
               64,66,67,68,72) # in files
# first sync the columns and names for when we don't have knowledge questions:
# such files carry 8, 16 or 19 columns and are right-padded with NA columns up
# to the full harmonized width
for(i in setdiff(seq_along(files), knowledge)){
  n.have <- ncol(data.list[[i]])
  stopifnot(n.have %in% c(8,16,19))
  n.pad <- ncol(eurobarometer) - n.have
  data.list[[i]] <- cbind(data.list[[i]],
                          data.frame(matrix(nrow = nrow(data.list[[i]]),
                                            ncol = n.pad)))
  colnames(data.list[[i]]) <- colnames(eurobarometer)
  # make sure factors don't carry over
  data.list[[i]][] <- lapply(data.list[[i]], as.character)
}
# now the files that DO have knowledge questions: locate the knowledge columns
# by their content, pad the material-wealth block to 19 columns and the
# knowledge block to 10, and apply the harmonized names
for(i in seq_along(files)){
  if(!(i %in% knowledge)){
    colnames(data.list[[i]]) <- colnames(eurobarometer)
    next
  }
  # make sure factors don't carry over
  data.list[[i]][] <- lapply(data.list[[i]], as.character)
  # get the columns with knowledge questions: a column counts as a knowledge
  # item if any of its values contains "correct"; if no column does, fall back
  # to columns containing "true". grepl() on a data frame tests each column's
  # deparsed contents and returns one logical per column.
  is.know <- grepl("correct", data.list[[i]], ignore.case = TRUE)
  if(!any(is.know)){
    is.know <- grepl("true", data.list[[i]], ignore.case = TRUE)
  }
  know.cols <- which(is.know)
  # get the others
  other.cols <- which(!is.know)
  stopifnot(length(know.cols) > 0, length(other.cols) > 0)
  last.know <- max(know.cols)
  last.other <- max(other.cols)
  # the padding arithmetic below only works if the non-knowledge columns are
  # the leading columns 1..last.other, there are at most 19 of them, and there
  # are at most 10 knowledge columns; fail here with a clear message rather
  # than later on a column-count mismatch
  stopifnot(last.other == length(other.cols),
            last.other <= 19,
            length(know.cols) <= 10)
  # fill and bind
  data.list[[i]] <- cbind(data.list[[i]][,other.cols], # basic columns
                          data.frame(matrix(nrow=nrow(data.list[[i]]),
                                            ncol=(19-last.other))),
                          # fill out mat wealth
                          data.list[[i]][,know.cols], # knowledge columns
                          data.frame(matrix(nrow=nrow(data.list[[i]]),
                                            ncol=(10-length(know.cols)))))
  # fill out knowledge
  colnames(data.list[[i]]) <- colnames(eurobarometer)
}
# now join and fill: stack all files and free the per-file list
eurobarometer <- do.call(rbind, data.list)
rm(data.list)
gc()
# fix gender: collapse the spelling variants to "Female" / "Male"
eurobarometer$gender[eurobarometer$gender %in% c("FEMALE", "Woman")] <- "Female"
eurobarometer$gender[eurobarometer$gender %in% c("MALE", "Man")] <- "Male"
# fix years: take the first four-digit run and store it as a number
eurobarometer$year <- as.numeric(str_extract(eurobarometer$year, "\\d{4}"))
# fix countries -- coding decisions first
# remove two-letter codes for some (a leading "XX - " prefix)
eurobarometer$country <- gsub("^[A-Z]{2} - ","",eurobarometer$country)
eurobarometer$country[eurobarometer$country %in%
                        c("GERMANY WEST", "GERMANY-WEST",
                          "DE-W - Germany - West")] <- "Germany West"
eurobarometer$country[eurobarometer$country %in%
                        c("GERMANY EAST", "GERMANY-EAST",
                          "DE-E Germany East")] <- "Germany East"
# after 1990 the two German samples are treated as one country
eurobarometer$country[which(eurobarometer$country %in%
                              c("Germany West", "Germany East") &
                              eurobarometer$year > 1990)] <- "Germany"
eurobarometer$country[eurobarometer$country %in%
                        c("GREAT BRITAIN", "NORTHERN IRELAND",
                          "Great Britain", "Northern Ireland",
                          "GB-GBN - Great Britain",
                          "GB-NIR Northern Ireland")] <- "United Kingdom"
eurobarometer$country[eurobarometer$country %in%
                        c("CYPRUS (CY-TCC)", "CYPRUS (TCC)",
                          "CYPRUS (REPUBLIC)", "Cyprus (Republic)",
                          "Cyprus (CY-TCC)", "Cyprus (TCC)",
                          "CY-TCC - Cyprus TCC")] <- "Cyprus"
eurobarometer$country[eurobarometer$country %in%
                        c("MACEDONIA (FYROM)", "Macedonia (FYROM)",
                          "Makedonia/FYROM")] <- "Macedonia"
eurobarometer$country[eurobarometer$country %in%
                        c("SPAIN", "ES -Spain")] <- "Spain"
eurobarometer$country[eurobarometer$country %in%
                        c("NETHERLANDS", "The Netherlands")] <- "Netherlands"
# one-to-one renames
eurobarometer$country[eurobarometer$country %in% "Germany (West+East)"] <-
  "Germany"
eurobarometer$country[eurobarometer$country %in% "AUSTRIA"] <- "Austria"
eurobarometer$country[eurobarometer$country %in% "BELGIUM"] <- "Belgium"
eurobarometer$country[eurobarometer$country %in% "DENMARK"] <- "Denmark"
eurobarometer$country[eurobarometer$country %in% "GREECE"] <- "Greece"
eurobarometer$country[eurobarometer$country %in% "FINLAND"] <- "Finland"
eurobarometer$country[eurobarometer$country %in% "FRANCE"] <- "France"
eurobarometer$country[eurobarometer$country %in% "IRELAND"] <- "Ireland"
eurobarometer$country[eurobarometer$country %in% "ITALY"] <- "Italy"
eurobarometer$country[eurobarometer$country %in% "LUXEMBOURG"] <- "Luxembourg"
eurobarometer$country[eurobarometer$country %in% "PORTUGAL"] <- "Portugal"
eurobarometer$country[eurobarometer$country %in% "SWEDEN"] <- "Sweden"
eurobarometer$country[eurobarometer$country %in% "BULGARIA"] <- "Bulgaria"
eurobarometer$country[eurobarometer$country %in% "CZECH REPUBLIC"] <-
  "Czech Republic"
eurobarometer$country[eurobarometer$country %in% "ESTONIA"] <- "Estonia"
eurobarometer$country[eurobarometer$country %in% "HUNGARY"] <- "Hungary"
eurobarometer$country[eurobarometer$country %in% "LATVIA"] <- "Latvia"
eurobarometer$country[eurobarometer$country %in% "LITHUANIA"] <- "Lithuania"
eurobarometer$country[eurobarometer$country %in% "MALTA"] <- "Malta"
eurobarometer$country[eurobarometer$country == "NORWAY"] <- "Norway"
eurobarometer$country[eurobarometer$country == "POLAND"] <- "Poland"
eurobarometer$country[eurobarometer$country == "ROMANIA"] <- "Romania"
eurobarometer$country[eurobarometer$country == "SLOVAKIA"] <- "Slovakia"
eurobarometer$country[eurobarometer$country == "SLOVENIA"] <- "Slovenia"
eurobarometer$country[eurobarometer$country == "TURKEY"] <- "Turkey"
eurobarometer$country[eurobarometer$country == "CROATIA"] <- "Croatia"
# fix left-right ideology: strip the "Box " prefix, blank out non-responses,
# map the labelled end points to 1 and 10, then convert to numeric
ideo <- gsub("Box ", "", eurobarometer$ideology)
ideo[grepl("NA", ideo)] <- NA
ideo[ideo %in% "DK"] <- NA
ideo[grepl("Refus", ideo, ignore.case = TRUE)] <- NA
# "left" is handled before "right", so a label naming both would end up as "1"
ideo[grepl("left", ideo, ignore.case = TRUE)] <- "1"
ideo[grepl("right", ideo, ignore.case = TRUE)] <- "10"
eurobarometer$ideology <- as.numeric(ideo)
# standardize survey names (worked on as a local vector, written back at the end)
survey_name <- eurobarometer$survey
survey_name[survey_name %in% "EB 66.1; 9-10 2006"] <- "Eurobarometer 66.1"
survey_name[survey_name %in% "713"] <- "Eurobarometer 71.3"
# drop parenthesised notes and anything following ", " or "; "
for (trailing_note in c("\\(.*\\)", ", .*", "; .*")) {
  survey_name <- gsub(trailing_note, "", survey_name)
}
survey_name[survey_name %in% "EB59.0"] <- "Eurobarometer 59.0"
# names that still contain no dot are tagged as coming from the Mannheim merge
survey_name <- paste0(survey_name,
                      ifelse(grepl("\\.", survey_name), "",
                             " (Mannheim merge)"))
survey_name <- gsub("EB", "Eurobarometer ", survey_name)
survey_name <- gsub("ECS", "European Community Study ", survey_name)
# remove trailing space
survey_name <- gsub(" $", "", survey_name, perl = TRUE)
# ordered (pattern, replacement) pairs that normalise the wave number; the
# order matters because later patterns see the output of earlier ones
number_fixes <- list(
  c("([0-9]{2})([0-9])", "\\1\\.\\2"),            # adding dots to MM EBs
  c(" ([0-9]{1}) ", " \\1\\.0 "),                 # adding 0's to single-digits
  c(" ([0-9]{2})([A-Z]{1}) ", " \\1\\.\\2 "),     # adding dots
  c("(Eurobarometer [0-9]{2})( )", "\\1\\.0\\2")  # adding 0's to early EBs
)
for (fix in number_fixes) {
  survey_name <- gsub(fix[1], fix[2], survey_name)
}
eurobarometer$survey <- survey_name
# now we can go back and finish coding the years: these surveys get their year
# from the (now standardized) survey name. Years are assigned as numbers so the
# column stays numeric; assigning quoted years would silently turn the whole
# column into character.
survey_years <- c(
  "Eurobarometer 62.0"    = 2004, "Eurobarometer 62.1" = 2004,
  "Eurobarometer 67.1"    = 2007, "Eurobarometer 67.2" = 2007,
  "Eurobarometer 71.1"    = 2009, "Eurobarometer 72.4" = 2009,
  "Eurobarometer 73.1"    = 2010, "Eurobarometer 73.4" = 2010,
  "Eurobarometer 74.1"    = 2010, "Eurobarometer 74.2" = 2010,
  "Eurobarometer 74.3"    = 2010,
  "Eurobarometer 75.2"    = 2011,
  "Eurobarometer 81.4"    = 2014, "Eurobarometer 82.1" = 2014,
  "Eurobarometer 83.2"    = 2015, "Eurobarometer 83.3" = 2015,
  "Eurobarometer 84.1"    = 2015, "Eurobarometer 84.3" = 2015,
  "Eurobarometer 84.4"    = 2015,
  "Eurobarometer 85.1"    = 2016, "Eurobarometer 85.2" = 2016,
  "Eurobarometer 85.1OVR" = 2016
)
year_from_survey <- eurobarometer$survey %in% names(survey_years)
eurobarometer$year[year_from_survey] <-
  unname(survey_years[eurobarometer$survey[year_from_survey]])
# The country recoding earlier in this script folds "Germany West" and
# "Germany East" into "Germany" only where year > 1990, and it runs before the
# years above are filled in. Any row whose year was NA at that point kept its
# West/East label, so the same rule is applied again now that the years are
# complete. Rows that were already merged are unaffected.
late_german_rows <- which(eurobarometer$country %in%
                            c("Germany West", "Germany East") &
                            eurobarometer$year > 1990)
eurobarometer$country[late_german_rows] <- "Germany"
# fix education -- the really high numbers are implausible as actual ages
# (e.g., over 100k said "97")
edu <- eurobarometer$education
edu_missing <- c("DK,NA", "INAP", "DK", "Refusal", "DK/NA", "<DK/NA>",
                 "(Other)", "NA", "99", "98", "97", "96", "NA (see note)")
edu[edu %in% edu_missing] <- NA
edu[grepl("outlier", edu)] <- NA
# respondents still studying are coded as 18, those without full-time
# education as 0
edu[edu %in% c("STILL STUDYING", "Still studying",
               "Still studying (see note)")] <- "18"
edu[edu %in% c("No full-time education", "No full time education")] <- "0"
# these two labels are recoded before the generic " years" suffix is stripped
edu[edu %in% "6 years (see note)"] <- "6"
edu[edu %in% "1 year"] <- "1"
edu <- gsub(" years", "", edu, fixed = TRUE)
# ugly but given age requirement in sampling, best approx
edu[edu %in% "22 OR OLDER"] <- "25"
edu[edu %in% "UP TO 14"] <- "13" # ugly
eurobarometer$education <- as.numeric(edu)
# fix income -- need to standardize across country-years
raw_income <- eurobarometer$income
# codes and labels that carry no income information
income_missing <- c("96", "97", "98", "99", "DK/Refusal", "DK,NA", "REFUSED",
                    "UNDOCUMENTED", "Inap. (no data for Norway)", "INAP")
raw_income[raw_income %in% income_missing] <- NA
# quartile symbols/labels -> a common vocabulary (list name = new label)
quartile_labels <- list(
  lowest  = c("- -", "-- (Lowest income quartile)",
              "- - (Lowest income quartile)"),
  lower   = c("-", "- (Next to lowest income quartile)"),
  higher  = c("+", "+ (Next to highest income quartile)"),
  highest = c("+ +", "++ (Highest income quartile)",
              "+ + (Highest income quartile)")
)
for (quartile in names(quartile_labels)) {
  raw_income[raw_income %in% quartile_labels[[quartile]]] <- quartile
}
eurobarometer$income <- raw_income
# make material goods data binary: "1" = item reported, "0" = not reported,
# NA = no usable answer
materials <- which(grepl("mat", colnames(eurobarometer)))
owned_labels <- c("Television", "DVD player", "Music CD player", "Computer",
                  "An Internet connection at home", "A car",
                  "An apartment/ a house which you have finished payi",
                  "An apartment\\ a house which you have finished payi",
                  "An apartment/ a house which you are paying for",
                  "An apartment\\ a house which you are paying for",
                  "Mentioned", "MENTIONED", "Yes")
not_owned_labels <- c("No", "Not mentioned", "NOT MENTIONED")
unknown_labels <- c("Data not available", "DK")
for (mat_col in materials) {
  eurobarometer[which(eurobarometer[, mat_col] %in% owned_labels),
                mat_col] <- "1"
  eurobarometer[which(eurobarometer[, mat_col] %in% not_owned_labels),
                mat_col] <- "0"
  eurobarometer[which(eurobarometer[, mat_col] %in% unknown_labels),
                mat_col] <- NA
}
# now we need to fix the political knowledge questions to be correct/incorrect
# rather than true/false
knowledge <- which(grepl("info", colnames(eurobarometer)))
# step 1: non-answers become NA
for (info_col in knowledge) {
  answers <- eurobarometer[, info_col]
  no_answer <- which(answers %in% c("DK", "NOT MENTIONED", "Not mentioned") |
                       grepl("Inap.", answers, fixed = TRUE))
  eurobarometer[no_answer, info_col] <- NA
}
# step 2: have to hand-code the surveys which didn't include "correct".
# For each survey, the answer listed per item is the one relabelled "correct".
answer_key <- list(
  "Eurobarometer 77.4" = c(info1 = "True.", info2 = "True.",
                           info3 = "True.", info4 = "False."),
  "Eurobarometer 79.5" = c(info1 = "True.", info2 = "True.",
                           info3 = "True.", info4 = "False."),
  "Eurobarometer 82.4" = c(info1 = "True.", info2 = "True.",
                           info3 = "True.", info4 = "False."),
  "Eurobarometer 84.3" = c(info1 = "True.", info2 = "True.",
                           info3 = "False.")
)
for (keyed_survey in names(answer_key)) {
  right_answers <- answer_key[[keyed_survey]]
  for (item in names(right_answers)) {
    answered_right <- which(eurobarometer$survey == keyed_survey &
                              eurobarometer[[item]] == right_answers[[item]])
    eurobarometer[[item]][answered_right] <- "correct"
  }
}
# step 3: finally, make binary. Anything mentioning "correct" scores 1; any
# other remaining text scores 0 (the "1"s just written contain no letters, so
# the second pass leaves them alone).
# NOTE(review): the "correct" pattern would also match a label such as
# "Incorrect" -- confirm no survey uses that wording.
for (info_col in knowledge) {
  scored_right <- which(grepl("correct", eurobarometer[, info_col],
                              ignore.case = TRUE))
  eurobarometer[scored_right, info_col] <- "1"
  scored_wrong <- which(grepl("[[:alpha:]]", eurobarometer[, info_col]))
  eurobarometer[scored_wrong, info_col] <- "0"
  eurobarometer[, info_col] <-
    as.numeric(as.character(eurobarometer[, info_col]))
}
# split into each country-year-survey... necessary because some country-years
# have multiple surveys with different knowledge questions
templist <- split(eurobarometer, f = list(eurobarometer$country,
                                          eurobarometer$year,
                                          eurobarometer$survey))
templist <- templist[lapply(templist, nrow) > 0]
# columns every country-year-survey carries out of this step (the mat_*
# items are still needed by the material wealth step that follows)
know_keep <- c("country", "year", "survey", "ideology", "gender",
               "occupation", "education", "income",
               "knowledge", "mat_tv", "mat_dvd", "mat_cd",
               "mat_computer", "mat_internet", "mat_car",
               "mat_accom_paid", "mat_accom_paying",
               "mat_laptop", "mat_tablet", "mat_smartphone")
# factor information
# (seq_along() rather than 1:length() so an empty list is simply skipped)
for (i in seq_along(templist)) {
  # get the temporary dataframe: row ids plus the knowledge items
  templist[[i]]$id <- rownames(templist[[i]])
  knowmat <- cbind(templist[[i]]$id, templist[[i]][, knowledge])
  colnames(knowmat)[1] <- "id"
  # remove all NA columns, walking right to left so positions stay valid
  for (j in ncol(knowmat):2) {       # 2 since column 1 is IDs
    if (all(is.na(unique(knowmat[, j])))) {
      knowmat <- knowmat[, -j]
    }
  }
  # if no knowledge questions were asked, only the id column is left and the
  # subsetting above has collapsed it to a plain vector (NULL dim); we skip
  # the country-year-survey with knowledge set to NA
  if (is.null(dim(knowmat))) {
    templist[[i]]$knowledge <- rep(NA, nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][, know_keep]
    next
  }
  # make factor
  for (j in 2:ncol(knowmat)) {
    knowmat[, j] <- factor(as.character(knowmat[, j]), levels = c("0", "1"))
  }
  # take only complete cases and factor -- sometimes this eliminates all obs,
  # so drop one question in that case (the last item is then ignored both for
  # completeness and in the MCA, but stays in the data frame)
  kmat <- knowmat[complete.cases(knowmat), ]
  if (nrow(kmat) > 0) {
    facs <- MCA(kmat[, -1], ncp = 5, graph = FALSE)
  } else {
    knames <- colnames(knowmat)
    kmat <- completeFun(knowmat, knames[-length(knames)])
    facs <- MCA(kmat[, -c(1, ncol(kmat))], ncp = 5, graph = FALSE)
  }
  # save first factor score
  kmat$knowledge <- facs$ind$coord[, 1]
  knowmat <- kmat
  # need to make sure scales are in the same direction -- make numeric,
  # sort by sum of 1's, then flip axes as necessary
  item_cols <- 2:(ncol(knowmat) - 1)
  for (j in item_cols) {
    knowmat[, j] <- as.numeric(as.character(knowmat[, j]))
  }
  # na.rm = TRUE matters in the fallback branch above: there the last item
  # can still be NA, and without it every row sum would be NA, the ordering
  # would be arbitrary and so would the flip decision below.
  # drop = FALSE keeps a single remaining item as a one-column data frame.
  n_correct <- rowSums(knowmat[, item_cols, drop = FALSE], na.rm = TRUE)
  knowmat <- knowmat[order(n_correct, decreasing = TRUE), ]
  # the respondent with the most correct answers should score higher than the
  # one with the fewest; if not, the axis is reversed
  if (head(knowmat$knowledge, 1) < tail(knowmat$knowledge, 1)) {
    knowmat$knowledge <- knowmat$knowledge * (-1)
  }
  # re-merge and store metadata, matching on ID
  templist[[i]] <- merge(templist[[i]], knowmat, by = "id",
                         all.x = TRUE, all.y = TRUE)
  templist[[i]] <- templist[[i]][, c(know_keep, "id")]
  # now we need to quantize
  tmpdf <- templist[[i]]
  # jitter to allow quantization
  tmpdf$knowledge <- jitter(as.numeric(tmpdf$knowledge), factor = 10e-8)
  knowqnt <- as.character(qcut(tmpdf$knowledge, 5)) # just to get labels
  # make sure factor levels align: the top quantile must have the higher mean
  top_label <- max(as.numeric(knowqnt), na.rm = TRUE)
  bottom_label <- min(as.numeric(knowqnt), na.rm = TRUE)
  stopifnot(mean(tmpdf$knowledge[which(knowqnt == top_label)]) >
              mean(tmpdf$knowledge[which(knowqnt == bottom_label)]))
  tmpdf$knowledge <- knowqnt
  tmpdf <- tmpdf[, c("id", "knowledge")]
  # swap the continuous score for its quantile label
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id",
                         all.x = TRUE, all.y = TRUE)
  templist[[i]]$knowledge.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "knowledge.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "knowledge"
  templist[[i]] <- templist[[i]][, know_keep]
}
eurobarometer <- do.call(rbind, templist)
# now factor material wealth, one country-year at a time
templist <- split(eurobarometer, f = list(eurobarometer$country,
                                          eurobarometer$year))
templist <- templist[lapply(templist, nrow) > 0]
materials <- which(grepl("mat", colnames(eurobarometer)))
# columns every country-year carries out of this step; both the skip branch
# and the normal path end with exactly this set
wealth_keep <- c("country", "year", "survey", "ideology", "gender",
                 "wealth", "occupation", "education",
                 "income", "knowledge")
# (seq_along() rather than 1:length() so an empty list is simply skipped)
for (i in seq_along(templist)) {
  # get the temporary dataframe: row ids plus the material-goods items
  templist[[i]]$id <- rownames(templist[[i]])
  matgoods <- cbind(templist[[i]]$id, templist[[i]][, materials])
  colnames(matgoods)[1] <- "id"
  # remove all NA columns, walking right to left so positions stay valid
  for (j in ncol(matgoods):2) {                       # 2 since column 1 is IDs
    if (all(is.na(unique(matgoods[, j])))) {
      matgoods <- matgoods[, -j]
    }
  }
  # if no durable goods questions were asked, only the id column is left and
  # the subsetting above has collapsed it to a plain vector (NULL dim); we
  # skip the country-year with wealth set to NA
  if (is.null(dim(matgoods))) {
    templist[[i]]$wealth <- rep(NA, nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][, wealth_keep]
    next
  }
  # make factor (MCA needs categorical input)
  for (j in 2:ncol(matgoods)) {
    matgoods[, j] <- factor(as.character(matgoods[, j]), levels = c("0", "1"))
  }
  # take only complete cases and factor
  matgoods <- matgoods[complete.cases(matgoods), ]
  facs <- MCA(matgoods[, -1], ncp = 5, graph = FALSE)
  # save first factor score
  matgoods$wealth <- facs$ind$coord[, 1]
  # need to make sure scales are in the same direction -- make numeric,
  # sort by sum of 1's, then flip axes as necessary
  item_cols <- 2:(ncol(matgoods) - 1)
  for (j in item_cols) {
    matgoods[, j] <- as.numeric(as.character(matgoods[, j]))
  }
  matgoods <- matgoods[order(rowSums(matgoods[, item_cols]),
                             decreasing = TRUE), ]
  # the respondent reporting the most items should score higher than the one
  # reporting the fewest; if not, the axis is reversed
  if (head(matgoods$wealth, 1) < tail(matgoods$wealth, 1)) {
    matgoods$wealth <- matgoods$wealth * (-1)
  }
  # re-merge and store metadata, matching on ID
  templist[[i]] <- merge(templist[[i]], matgoods, by = "id",
                         all.x = TRUE, all.y = TRUE)
  templist[[i]] <- templist[[i]][, c(wealth_keep, "id")]
  # now we need to quantize
  tmpdf <- templist[[i]]
  # jitter to allow quantization
  tmpdf$wealth <- jitter(as.numeric(tmpdf$wealth), factor = 10e-8)
  matqnt <- as.character(qcut(tmpdf$wealth, 5)) # just to get labels
  # make sure factor levels align: the top quantile must have the higher mean
  top_label <- max(as.numeric(matqnt), na.rm = TRUE)
  bottom_label <- min(as.numeric(matqnt), na.rm = TRUE)
  stopifnot(mean(tmpdf$wealth[which(matqnt == top_label)]) >
              mean(tmpdf$wealth[which(matqnt == bottom_label)]))
  tmpdf$wealth <- matqnt
  tmpdf <- tmpdf[, c("id", "wealth")]
  # swap the continuous score for its quantile label
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id",
                         all.x = TRUE, all.y = TRUE)
  templist[[i]]$wealth.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "wealth.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "wealth"
  templist[[i]] <- templist[[i]][, wealth_keep]
}
# quantize income into factors
# surveys whose income already arrives as labelled quartiles
prefacs <- c("Eurobarometer 58.0", "Eurobarometer 58.1", "Eurobarometer 58.2",
             "Eurobarometer 59.0", "Eurobarometer 59.1", "Eurobarometer 59.2",
             "Eurobarometer 60.0", "Eurobarometer 60.1", "Eurobarometer 60.2",
             "Eurobarometer 60.3", "Eurobarometer 61.0")
income_keep <- c("country", "year", "survey", "ideology", "gender",
                 "wealth", "occupation", "education",
                 "income", "knowledge")
# (seq_along() rather than 1:length() so an empty list is simply skipped)
for (i in seq_along(templist)) {
  # assign IDs and get temp df
  templist[[i]]$id <- rownames(templist[[i]])
  tmpdf <- templist[[i]][, c("id", "income", "survey")]
  # nothing to quantize when income is entirely missing
  if (all(is.na(unique(templist[[i]]$income)))) {
    templist[[i]]$income <- rep(NA, nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][, income_keep]
    next
  }
  # give everyone an income quantile. break ties with random assignment
  # through jitter. jitter shouldn't matter since it will never separate top
  # and bottom quintiles, which is what we care about.
  # first, reclassify
  tmpdf_prefac <- tmpdf[which(tmpdf$survey %in% prefacs), ]
  tmpdf_needfac <- tmpdf[!(tmpdf$survey %in% prefacs), ]
  if (length(unique(as.character(tmpdf_prefac$income))) >= 3) {
    # making quartiles into quantiles by omitting middle class
    tmpdf_prefac$income <- factor(tmpdf_prefac$income,
                                  levels = c("lowest", "lower",
                                             "higher", "highest"),
                                  labels = c("0", "1", "3", "4"))
  }
  # only quantize when there is enough variation (at least three distinct
  # values, NA counting as one).
  # NOTE(review): when this test fails the income values are passed through
  # unchanged rather than set to NA -- confirm that is intended.
  if (length(unique(as.character(tmpdf_needfac$income))) >= 3) {
    tmpdf_needfac$income <- jitter(as.numeric(tmpdf_needfac$income),
                                   factor = 10e-8)
    incqnt <- as.character(qcut(tmpdf_needfac$income, 5)) # just to get labels
    # make sure factor levels align: the top quantile must have the higher mean
    top_label <- max(as.numeric(incqnt), na.rm = TRUE)
    bottom_label <- min(as.numeric(incqnt), na.rm = TRUE)
    stopifnot(mean(tmpdf_needfac$income[which(incqnt == top_label)]) >
                mean(tmpdf_needfac$income[which(incqnt == bottom_label)]))
    tmpdf_needfac$income <- incqnt
  }
  tmpdf_needfac$income <- as.character(tmpdf_needfac$income)
  tmpdf_prefac$income <- as.character(tmpdf_prefac$income)
  tmpdf <- rbind(tmpdf_needfac, tmpdf_prefac)
  # swap the raw income for the recoded one, matching on ID
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id")
  templist[[i]]$survey.y <- templist[[i]]$income.x <- templist[[i]]$id <- NULL
  names(templist[[i]])[names(templist[[i]]) == "survey.x"] <- "survey"
  names(templist[[i]])[names(templist[[i]]) == "income.y"] <- "income"
  templist[[i]] <- templist[[i]][, income_keep]
}
# finally, education
edu_keep <- c("country", "year", "survey", "ideology", "gender",
              "wealth", "occupation", "education",
              "income", "knowledge")
# (seq_along() rather than 1:length() so an empty list is simply skipped)
for (i in seq_along(templist)) {
  # assign IDs and get temp df
  templist[[i]]$id <- rownames(templist[[i]])
  tmpdf <- templist[[i]][, c("id", "education")]
  # nothing to quantize when education is entirely missing
  if (all(is.na(unique(templist[[i]]$education)))) {
    templist[[i]]$education <- rep(NA, nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][, edu_keep]
    next
  }
  # give everyone an education quantile. break ties with random assignment
  # through jitter. jitter shouldn't matter since it will never separate top
  # and bottom quintiles, which is what we care about.
  tmpdf$education <- jitter(as.numeric(tmpdf$education), factor = 10e-8)
  eduqnt <- as.character(qcut(tmpdf$education, 5))
  # make sure factor levels align: the top quantile must have the higher mean
  top_label <- max(as.numeric(eduqnt), na.rm = TRUE)
  bottom_label <- min(as.numeric(eduqnt), na.rm = TRUE)
  stopifnot(mean(tmpdf$education[which(eduqnt == top_label)]) >
              mean(tmpdf$education[which(eduqnt == bottom_label)]))
  tmpdf$education <- eduqnt
  # swap the raw education value for its quantile label, matching on ID
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id")
  templist[[i]]$education.x <- NULL
  names(templist[[i]])[names(templist[[i]]) == "education.y"] <- "education"
  templist[[i]] <- templist[[i]][, edu_keep]
}
# recombine
eurobarometer <- do.call(rbind, templist)
### code occupation
# Ordered (pattern, code) rules, applied top to bottom with case-insensitive
# matching. Order matters: once a label has been replaced by its code it no
# longer matches later patterns (e.g. "unskill" is settled before "skill").
# A code of NA drops the response.
occupation_rules <- matrix(c(
  "inap",         NA,
  "DK",           NA,
  "refusal",      NA,
  "unempl",       NA,
  "retired",      NA,
  "milit",        NA,
  "lawyer",       "4",
  "unskill",      "0",
  "fisher",       "0",
  "farmer",       "0",
  "never",        "0",
  "skill",        "2",
  "manual",       "0",
  "student",      "1",
  "employed",     "2",
  "middle",       "3",
  "general",      "3",
  "propri",       "4",
  "owner",        "3",
  "Professional", "4",
  "white",        "4",
  "supervisor",   "3",
  "resp",         "1",
  "unskd",        "0",
  "office",       "1",
  "prop",         "3",
  "NA",           NA
), ncol = 2, byrow = TRUE)
occ <- eurobarometer$occupation
for (rule in seq_len(nrow(occupation_rules))) {
  matched <- grepl(occupation_rules[rule, 1], occ, ignore.case = TRUE)
  occ[matched] <- occupation_rules[rule, 2]
}
# anything that is still text at this point becomes NA in the conversion
eurobarometer$occupation <- as.numeric(occ)
# save original scales
eurobarometer$ideology_scale_orig <- rep("1-10", nrow(eurobarometer))
# fix a few country names
country_renames <- c("Slovakia" = "Slovak Republic",
                     "Macedonia" = "Macedonia, FYR")
for (old_name in names(country_renames)) {
  eurobarometer$country[which(eurobarometer$country == old_name)] <-
    country_renames[[old_name]]
}
# scale everything: ideology goes from its 1-10 range onto [-1, 1]
eurobarometer$ideology_scaled <- rescalr(eurobarometer$ideology, 1, 10, -1, 1)
# clean up: these columns may still be character at this point
for (num_col in c("year", "income", "wealth", "knowledge", "education")) {
  eurobarometer[[num_col]] <- as.numeric(eurobarometer[[num_col]])
}
# sort by country and year, keep the output columns in their final order
output_cols <- c("country", "year", "survey", "ideology", "gender", "wealth",
                 "occupation", "education", "income", "knowledge",
                 "ideology_scaled", "ideology_scale_orig")
row_order <- order(eurobarometer$country, eurobarometer$year)
eurobarometer <- eurobarometer[row_order, output_cols]
write.csv(eurobarometer, "./cleaned-data/eurobarometer.csv", row.names = F)
# drop everything from the workspace except the helpers and the base panel
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df",
                          "qcut", "rescalr", "simpleCap", "df.final")))

### 75. FNEPS - mass
# Harmonize the FNEPS mass file: keep five items, relabel gender, turn income
# and education into within-sample quintile labels, dichotomize occupation,
# and write the common column layout to ./cleaned-data.
fneps <- read.dta("./FNEPS/mass/FNEPS-mass.dta")
fneps <- setNames(fneps[, c("v79", "v366", "v363", "v333", "v378")],
                  c("ideology", "gender", "education", "occupation", "income"))
n_resp <- nrow(fneps)
# gender is the only item whose codes need relabelling
fneps$gender[fneps$gender %in% "1"] <- "Male"
fneps$gender[fneps$gender %in% "2"] <- "Female"
# quintile labels for income first, then education (the order fixes which
# random draws jitter() consumes); the tiny jitter only breaks ties so that
# the quantile breakpoints are unique
for (item in c("income", "education")) {
  jittered <- jitter(as.numeric(fneps[[item]]), factor = 10e-8)
  fneps[[item]] <- as.character(qcut(jittered, 5))
}
# dichotomize occupation: codes above 4 are workers, codes below 5 are
# professionals (the second rule is applied last, as in the raw recode)
occ_code <- fneps$occupation
occ_label <- rep(NA, n_resp)
occ_label[which(occ_code > 4)] <- "worker"
occ_label[which(occ_code < 5)] <- "professional"
fneps$occupation <- factor(occ_label, levels = c("worker", "professional"))
# constants and placeholders for items this survey does not provide
fneps$wealth <- rep(NA, n_resp)
fneps$knowledge <- rep(NA, n_resp)
fneps$survey <- rep("FNEPS", n_resp)
fneps$country <- rep("France", n_resp)
fneps$year <- rep("1967", n_resp)
# ideology is treated as a 1-95 scale and mapped linearly onto [-1, 1]
fneps$ideology_scale_orig <- rep("1-95", n_resp)
fneps$ideology_scaled <- rescalr(fneps$ideology, 1, 95, -1, 1)
harmonized_cols <- c("country", "year", "survey", "ideology", "gender",
                     "wealth", "occupation", "education", "income",
                     "knowledge", "ideology_scaled", "ideology_scale_orig")
fneps <- fneps[order(fneps$country, fneps$year), harmonized_cols]
# save for later
write.csv(fneps, "./cleaned-data/fneps-mass.csv", row.names = F)
# clear the workspace except for the shared helpers and df.final
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df",
                          "qcut", "rescalr", "simpleCap", "df.final")))

### 76. Malaise book (Joignant et al. 2016) - mass
# Harmonize the Argentine citizen survey: ideology, gender, education and
# income are recoded, four knowledge items are combined through MCA, and the
# three ordinal covariates are stored as quintile labels "0"-"4".
#
# Each quintile step is followed by a direction check. When a check fails it
# warns and deletes the label vector, so the assignment that follows stops the
# script with an "object not found" error instead of storing a reversed scale.
load("./Malaise/ARGcitizens.rdata")
ARGcitizens$country <- rep("Argentina",nrow(ARGcitizens))
ARGcitizens$year <- rep("2014",nrow(ARGcitizens))
ARGcitizens <- ARGcitizens[,c("country","year", "P41","P61","P65","P85","P44_1B",
                              "P44_2B","P44_3B","P44_4B")] # P85 should be "P87"
# NOTE(review): the inline remark above says the income item may really be
# P87 -- confirm against the codebook before relying on the income column.
colnames(ARGcitizens)[3:6] <- c("ideology","gender","education","income")
# no wealth or occupation data
# clean gender
ARGcitizens$gender[which(ARGcitizens$gender == "1")] <- "Male"
ARGcitizens$gender[which(ARGcitizens$gender == "2")] <- "Female"
# cleaning: 99 is the missing code
ARGcitizens$ideology[ARGcitizens$ideology == 99] <- NA
ARGcitizens$education[ARGcitizens$education == 99] <- NA
# quantizing education (jitter only breaks ties so the breakpoints are unique)
ARGcitizens$education <- jitter(ARGcitizens$education, factor=10e-8)
eduqnt <- as.character(qcut(ARGcitizens$education,5))
if(mean(ARGcitizens$education[which(eduqnt == max(as.numeric(eduqnt),
                                                  na.rm=TRUE))]) <=
   mean(ARGcitizens$education[which(eduqnt == min(as.numeric(eduqnt),
                                                  na.rm=TRUE))])){
  warning("Quantizing has gone awry -- higher values means less educated,
          Deleting eduqnt so you don't mess up.")
  rm(eduqnt)
}
ARGcitizens$education <- eduqnt
# quantizing income
ARGcitizens$income[ARGcitizens$income > 11] <- NA # 99 = NA, one miscode at =12.
ARGcitizens$income <- jitter(ARGcitizens$income, factor=10e-8)
incqnt <- as.character(qcut(ARGcitizens$income,5))
if(mean(ARGcitizens$income[which(incqnt == max(as.numeric(incqnt),
                                               na.rm=TRUE))]) <=
   mean(ARGcitizens$income[which(incqnt == min(as.numeric(incqnt),
                                               na.rm=TRUE))])){
  # the message now names the variable that actually failed the check
  warning("Quantizing has gone awry -- higher values means lower income,
          Deleting incqnt so you don't mess up.")
  rm(incqnt)
}
ARGcitizens$income <- incqnt
# clean the knowledge questions -- 1 is correct for all of them; 2 becomes 0
# and every other answer becomes NA through the factor levels
knowledge <- which(grepl("P44", colnames(ARGcitizens)))
for(i in knowledge){
  ARGcitizens[,i] <- as.character(ARGcitizens[,i])
  ARGcitizens[which(ARGcitizens[,i] == "2"),i] <- "0"
  ARGcitizens[,i] <- factor(as.character(ARGcitizens[,i]), levels=c("0","1"))
}
# MCA on knowledge questions; the first dimension is the knowledge score
facs <- MCA(ARGcitizens[,knowledge], ncp=5, graph=F)
ARGcitizens$knowledge <- facs$ind$coord[,1]
# quantize
ARGcitizens$knowledge <- jitter(ARGcitizens$knowledge, factor=10e-8)
knwqnt <- as.character(qcut(ARGcitizens$knowledge,5))
# need to make sure scales are in the same direction -- make numeric,
# count the correct answers, and compare top and bottom quintiles
for(i in knowledge){
  ARGcitizens[,i] <- as.numeric(as.character(ARGcitizens[,i]))
}
ARGcitizens$knowtest <- rowSums(ARGcitizens[,knowledge])
# respondents with a missing item have an NA count but still carry a quintile
# label, so the means skip them; otherwise the comparison itself can be NA
if(mean(ARGcitizens$knowtest[which(knwqnt == max(as.numeric(knwqnt),
                                                 na.rm=TRUE))],
        na.rm=TRUE) <=
   mean(ARGcitizens$knowtest[which(knwqnt == min(as.numeric(knwqnt),
                                                 na.rm=TRUE))],
        na.rm=TRUE)){
  warning("Quantizing has gone awry -- higher values means less informed,
          Deleting knwqnt so you don't mess up.")
  rm(knwqnt)
}
ARGcitizens$knowledge <- knwqnt
# scaling ideology: 0-10 item mapped linearly onto [-1, 1]
ARGcitizens$ideology_scaled <- rescalr(ARGcitizens$ideology, 0, 10, -1, 1)
ARGcitizens$ideology_scale_orig <- rep("0-10",nrow(ARGcitizens))
ARGcitizens$survey <- rep("Joignant et al 2016",nrow(ARGcitizens))
# cleaning up: placeholders for unavailable items, then the common layout
ARGcitizens$wealth <- ARGcitizens$occupation <- rep(NA,nrow(ARGcitizens))
ARGcitizens <- ARGcitizens[order(ARGcitizens$country,ARGcitizens$year),
                           c("country","year","survey","ideology","gender","wealth",
                             "occupation","education","income","knowledge",
                             "ideology_scaled","ideology_scale_orig")]
# store for later
write.csv(ARGcitizens, "./cleaned-data/malaise-citizens.csv",row.names = F)
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

### 77. JGSS
# Read every JGSS .dta file, align the five waves on a common set of items,
# stack them, and recode gender, ideology, schooling and income.
files <- paste("./JGSS/",
               list.files(path = "./JGSS", pattern = "\\.dta$", recursive = T),
               sep = "")
waves <- lapply(files, function(f) read.dta(f, convert.factors = F))
# get vars we need
shared_items <- c("XXJOB", "SZINCOMX", "XXLSTSCH", "OP5RADCA", "SEXA")
# first file: pooled waves that carry their own year variable
waves[[1]] <- waves[[1]][, c("RYEAR", shared_items)]
waves[[1]]$RYEAR <- as.character(waves[[1]]$RYEAR)
waves[[1]]$survey <- rep("JGSS 2000-2003", nrow(waves[[1]]))
# files 2-5: single-year waves; SIZE is only kept as a slot that is
# overwritten with the survey year
# 10-12 becomes 10-13, with 14 and 99 NA's
wave_years <- c("2005", "2006", "2008", "2010")
for (k in seq_along(wave_years)) {
  slot <- k + 1
  waves[[slot]] <- waves[[slot]][, c("SIZE", shared_items)]
  waves[[slot]]$SIZE <- rep(wave_years[k], nrow(waves[[slot]]))
  waves[[slot]]$survey <- rep(paste("JGSS", wave_years[k]),
                              nrow(waves[[slot]]))
}
waves <- lapply(waves, function(w) {
  colnames(w) <- c("year", "occupation", "income", "education",
                   "ideology", "gender", "survey")
  w
})
# join
jgss <- do.call(rbind, waves)
# clean a bit
jgss$gender[jgss$gender %in% "1"] <- "Male"
jgss$gender[jgss$gender %in% "2"] <- "Female"
jgss$country <- rep("Japan", nrow(jgss))
jgss$ideology[which(jgss$ideology == 9)] <- NA
jgss$ideology <- 6 - jgss$ideology # invert scale
jgss$ideology_scaled <- rescalr(jgss$ideology, 1, 5, -1, 1)
jgss$ideology_scale_orig <- rep("1-5", nrow(jgss))
# fix schooling info to be ordinal: the waves before 2006 and the later ones
# use different top codes, so they are recoded separately and re-stacked
jgss.old <- jgss[which(jgss$year < 2006), ]
jgss.new <- jgss[which(jgss$year > 2005), ]
jgss.old$education[jgss.old$education %in% c(1, 2, 3)] <- 8
jgss.old$education[jgss.old$education %in% c(4, 5, 6)] <- 9
jgss.old$education[jgss.old$education %in% 7] <- 11
jgss.old$education[jgss.old$education %in% c(99, 13)] <- NA
jgss.new$education[jgss.new$education %in% c(1, 2, 3)] <- 8
jgss.new$education[jgss.new$education %in% c(4, 5, 6)] <- 9
jgss.new$education[jgss.new$education %in% 7] <- 12
jgss.new$education[jgss.new$education %in% c(99, 14)] <- NA
jgss <- rbind(jgss.old, jgss.new)
# fix income: codes above 19 are treated as missing
jgss$income[which(jgss$income > 19)] <- NA
jgss$wealth <- rep(NA, nrow(jgss))
jgss <- jgss[order(jgss$country, jgss$year),
             c("country", "year", "survey", "ideology", "gender", "wealth",
               "occupation", "education", "income", "ideology_scaled",
               "ideology_scale_orig")]
# split into each country-year
templist <- split(jgss, f = list(jgss$country, jgss$year))
# split() also creates the country-year combinations with no rows; drop them
templist <- templist[vapply(templist, nrow, integer(1)) > 0]

# column layout every country-year data frame is returned in
jgss_cols <- c("country", "year", "survey", "ideology", "gender", "wealth",
               "occupation", "education", "income", "ideology_scaled",
               "ideology_scale_orig")

# Replace column `var` of one country-year data frame with within-group
# quintile labels ("0"-"4").
#   panel: data frame holding the columns in jgss_cols
#   var:   name of the column to quantize ("income" or "education")
# Returns the data frame restricted to jgss_cols. If `var` is entirely
# missing the column is left as NA. Stops if the top quintile does not have
# a higher mean than the bottom one.
quantize_panel <- function(panel, var) {
  panel$id <- rownames(panel)
  if (all(is.na(panel[[var]]))) {
    panel[[var]] <- rep(NA, nrow(panel))
    return(panel[, jgss_cols])
  }
  scores <- panel[, c("id", var)]
  # break ties with random assignment through jitter. jitter shouldn't matter
  # since it will never separate top and bottom quintiles, which is what we
  # care about
  scores[[var]] <- jitter(as.numeric(scores[[var]]), factor = 10e-8)
  qnt <- as.character(qcut(scores[[var]], 5))
  top <- which(qnt == max(as.numeric(qnt), na.rm = TRUE))
  bottom <- which(qnt == min(as.numeric(qnt), na.rm = TRUE))
  stopifnot(mean(scores[[var]][top]) > mean(scores[[var]][bottom]))
  scores[[var]] <- qnt
  # swap the raw column for the quintile labels, matching rows on id
  panel <- merge(panel, scores, by = "id")
  panel[[paste0(var, ".x")]] <- NULL
  names(panel)[names(panel) == paste0(var, ".y")] <- var
  panel[, jgss_cols]
}

# factor income for every country-year, then education; the two passes are
# kept separate so jitter() consumes random draws in the same order as before
for (i in seq_along(templist)) {
  print(i)
  templist[[i]] <- quantize_panel(templist[[i]], "income")
}
# factor education
for (i in seq_along(templist)) {
  print(i)
  templist[[i]] <- quantize_panel(templist[[i]], "education")
}
# recombine
jgss <- do.call(rbind, templist)
### clean occupation data
# Collapse the raw job codes into ordinal classes stored as strings. All
# conditions are evaluated on the untouched raw codes; the first assignment
# turns the working vector into character, so codes that match no rule stay
# in it as their raw value.
raw_job <- jgss$occupation
job_class <- raw_job
job_class[which(raw_job < 553)] <- "4"
# exceptions inside the range above
job_class[raw_job %in% c(515, 520, 521, 522, 523, 537, 543)] <- "2"
job_class[which(raw_job > 552 & raw_job < 569)] <- "3"
job_class[which(raw_job > 568 & raw_job < 599)] <- "1"
job_class[which(raw_job > 598 & raw_job < 689)] <- "0"
job_class[raw_job %in% 689] <- NA
# individually recoded codes
single_codes <- c("690" = "3", "691" = "4", "701" = "1", "702" = "0",
                  "703" = "2", "704" = "0", "705" = "1")
for (code in names(single_codes)) {
  job_class[raw_job %in% as.numeric(code)] <- single_codes[[code]]
}
job_class[which(raw_job > 900)] <- NA
jgss$occupation <- job_class
# no knowledge questions
jgss$knowledge <- rep(NA, nrow(jgss))
# store for later
final_cols <- c("country", "year", "survey", "ideology", "gender", "wealth",
                "occupation", "education", "income", "knowledge",
                "ideology_scaled", "ideology_scale_orig")
jgss <- jgss[order(jgss$country, jgss$year), final_cols]
write.csv(jgss, "./cleaned-data/jgss.csv", row.names = F)
# clear the workspace except for the shared helpers and df.final
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df",
                          "qcut", "rescalr", "simpleCap", "df.final")))

### 78. LAPOP
### read in the data
# labelled Stata columns are converted to factors, then to a plain data frame
lapop <- as.data.frame(as_factor(
  read_dta("./LAPOP/AmericasBarometer Grand Merge 2004-2014.dta",
           encoding = "latin1")))

### trim to the variables we need then make strings
lapop$country <- lapop$pais
lapop$gender <- lapop$q1
lapop$ideology <- lapop$l1
lapop$education <- lapop$ed
lapop$occupation <- lapop$ocup1
lapop$income <- lapop$q10
knowledge_items <- c(paste0("gi", 1:5), "gi7r", "gix4")
goods_items <- c(paste0("r", 1:4), "r4a", "r4b", paste0("r", 5:8),
                 paste0("r", 12:18), paste0("r", 20:26))
lapop <- lapop[, c("country", "year", "ideology", "gender", "education",
                   "occupation", "income", knowledge_items, goods_items)]
lapop[] <- lapply(lapop, as.character)
# fix country names: survey label -> name used in the final panel
country_fix <- c("Bahamas" = "Bahamas, The",
                 "Belice" = "Belize",
                 "Brasil" = "Brazil",
                 "Canadá" = "Canada",
                 "Estados Unidos" = "United States",
                 "Haití" = "Haiti",
                 "México" = "Mexico",
                 "Panamá" = "Panama",
                 "Perú" = "Peru",
                 "República Dominicana" = "Dominican Republic",
                 "Surinam" = "Suriname",
                 "Trinidad y Tobago" = "Trinidad and Tobago",
                 "Venezuela" = "Venezuela, RB")
needs_fix <- lapop$country %in% names(country_fix)
lapop$country[needs_fix] <- unname(country_fix[lapop$country[needs_fix]])
# fix gender
lapop$gender[lapop$gender %in% c("No Responde", "No Sabe")] <- NA
lapop$gender[lapop$gender %in% "Mujer"] <- "Female"
lapop$gender[lapop$gender %in% "Hombre"] <- "Male"
# fix occupational data: collapse the survey categories into three classes;
# labels that appear in none of the sets stay NA
worker_jobs <- c(
  "Trabajador especializado (operador de maquinaria, albañil, etc.)",
  "Oficinista (secretaria, operador de maquina de oficina, etc.)",
  "Vendedor demostrador en almacenes y mercados",
  "Empleado, fuera de oficina, en el sector de servicios",
  "Peón agrícola (trabaja la tierra para otros)",
  "Artesano",
  "Servicio doméstico",
  "Obrero")
professional_jobs <- c(
  "Profesional, intelectual y científico",
  "Director (gerente, jefe de departamento, supervisor)",
  "Técnico o profesional de nivel medio (técnico en computación, etc.)",
  "Funcionario del gobierno (miembro de los órganos legislativo, etc.)",
  "Comerciante (vendedor ambulante, propietario de establecimiento, etc.)")
other_jobs <- c(
  "Campesino, agricultor, o productor agropecuario y pesquero",
  "Miembro de las fuerzas armadas o personal de servicio de protección y seguridad")
job_class <- rep(NA, nrow(lapop))
job_class[lapop$occupation %in% worker_jobs] <- "worker"
job_class[lapop$occupation %in% professional_jobs] <- "professional"
job_class[lapop$occupation %in% other_jobs] <- "other"
lapop$occupation <- as.factor(job_class)
# fix material wealth -- make it binary, then export and factor by country-year
numerics <- grep("r\\d", colnames(lapop))
owns_item <- c("Sí", "Yes",
               "Uno", "Dos", "Tres o más")  # car counts collapse to "owns"
goods_missing <- c("No se le preguntó en este país o año", "No Responde",
                   "No Sabe", "NR", "No Aplica")
for (col in numerics) {
  answers <- lapop[, col]
  answers[answers %in% owns_item] <- "1"
  answers[answers %in% "No"] <- "0"
  answers[answers %in% goods_missing] <- NA
  lapop[, col] <- factor(answers, levels = c("0", "1"))
}
# income and education: turn the text endpoints into numbers, blank out the
# non-response labels, then coerce to numeric
tens <- which(colnames(lapop) %in% c("income", "education"))
scale_labels <- c("Ninguno" = "0", "Izquierda" = "1", "Derecha" = "10",
                  "18+" = "19") # going to be factors anyway
scale_missing <- c("No Sabe", "Don't Know", "No Responde", "No Response",
                   "No Aplica", "Not Applicable",
                   "No se le preguntó en este país o año",
                   "Not asked in this country or year", "None")
for (col in tens) {
  answers <- lapop[, col]
  for (label in names(scale_labels)) {
    answers[answers %in% label] <- scale_labels[[label]]
  }
  answers[answers %in% scale_missing] <- NA
  lapop[, col] <- as.numeric(answers)
}
# knowledge items: blank out non-response and unify the English labels with
# the Spanish ones
knowledge <- grep("gi", colnames(lapop))
quiz_missing <- c("No Sabe", "Don't Know", "No Responde", "No Response",
                  "No Aplica", "Not Applicable",
                  "No se le preguntó en este país o año",
                  "Not asked in this country or year")
for (col in knowledge) {
  answers <- lapop[, col]
  answers[answers %in% quiz_missing] <- NA
  answers[answers %in% "Correct"] <- "Correcto"
  answers[answers %in% "Incorrect"] <- "Incorrecto"
  lapop[, col] <- answers
}
# clean gi3: only "Correcto" or a literal "50" counts as correct; every other
# non-missing answer is incorrect
gi3_right <- lapop$gi3 %in% c("Correcto", "50")
lapop$gi3[!is.na(lapop$gi3) & !gi3_right] <- "Incorrecto"
lapop$gi3[gi3_right] <- "Correcto"
# clean gi2
lapop$gi2[lapop$gi2 %in% "Incorrecto/NS"] <- "Incorrecto"
# make factors: "0" = incorrect, "1" = correct
for (col in knowledge) {
  lapop[, col] <- factor(as.character(lapop[, col]),
                         levels = c("Incorrecto", "Correcto"),
                         labels = c("0", "1"))
}
# split into each country-year
templist <- split(lapop, f = list(lapop$country, lapop$year))
# split() also returns the empty country-year combinations; drop them
templist <- templist[lapply(templist,nrow)>0]
# factor information: for each country-year, reduce the knowledge items to a
# single score with MCA and store it as quintile labels "0"-"4" in `knowledge`.
# Every exit path leaves the data frame with the same columns in the same
# order.
for(i in 1:length(templist)){
  print(i)
  # get the temporary dataframe
  templist[[i]]$id <- rownames(templist[[i]])
  knowmat <- cbind(templist[[i]]$id,templist[[i]][,knowledge])
  colnames(knowmat)[1] <- "id"
  # remove all NA columns (iterating backwards so dropping a column does not
  # shift the indices still to be visited)
  for(j in ncol(knowmat):2){       # 2 since column 1 is IDs
    if(all(is.na(unique(knowmat[,j]))) == T){
      knowmat <- knowmat[,-j]
    }
  }
  # if no knowledge questions were asked, we skip the country-year (dropping
  # the last item column leaves only the id vector, which has no dim)
  if(is.null(dim(knowmat)) == T){
    templist[[i]]$knowledge <- rep(NA,nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][,c("country","year","ideology","gender","occupation",
                                      "education","income","knowledge","r1", "r2",
                                      "r3", "r4", "r4a", "r4b", "r5", "r6", "r7", 
                                      "r8", "r12", "r13", "r14", "r15", "r16", 
                                      "r17", "r18", "r20", "r21", "r22", "r23", 
                                      "r24", "r25", "r26")]
    next
  }
  # take only complete cases and factor
  # NOTE(review): an earlier comment here said that when this eliminates all
  # observations one question is dropped, but no such step exists below --
  # an empty kmat with several items goes straight into MCA(); confirm.
  kmat <- knowmat[complete.cases(knowmat),]
  # only one usable item: use it directly instead of running MCA. A correct
  # answer ("1") is relabelled "4", presumably so it lines up with the top
  # label of the 0-4 quintile scale -- TODO confirm
  if(dim(kmat)[2] == 2){
    colnames(knowmat)[2] <- "knowledge"
    stopifnot(nrow(templist[[i]]) == nrow(knowmat))
    knowmat$knowledge <- as.character(knowmat$knowledge)
    knowmat$knowledge[which(knowmat$knowledge == "1")] <- "4"
    templist[[i]]$knowledge <- knowmat$knowledge
    templist[[i]] <- templist[[i]][,c("country","year","ideology","gender","occupation",
                                      "education","income","knowledge","r1", "r2",
                                      "r3", "r4", "r4a", "r4b", "r5", "r6", "r7", 
                                      "r8", "r12", "r13", "r14", "r15", "r16", 
                                      "r17", "r18", "r20", "r21", "r22", "r23", 
                                      "r24", "r25", "r26")]
    next
  }
  # MCA
  facs <- MCA(kmat[,-1], ncp = 5, graph = F)
  # save first factor score
  kmat$knowledge <- facs$ind$coord[,1]
  knowmat <- kmat
  # need to make sure scales are in the same direction -- make numeric,
  # sort by sum of 1's, then flip axes as necessary
  for(j in 2:(ncol(knowmat)-1)){
    knowmat[,j] <- as.numeric(as.character(knowmat[,j]))
  }
  knowmat <- knowmat[order(rowSums(knowmat[,
                                           2:(ncol(knowmat)-1)]),decreasing=T),]
  # the check compares only the first and last row after sorting: if the
  # respondent with the most correct answers scores lower, the axis is flipped
  if(as.logical(head(knowmat$knowledge, 1) < tail(knowmat$knowledge, 1)) == T){
    knowmat$knowledge <- knowmat$knowledge*(-1)
  }
  # re-merge and store metadata, matching on ID (respondents dropped as
  # incomplete cases come back with an NA score because all.x = T)
  templist[[i]] <- merge(templist[[i]],knowmat, by = "id", all.x = T, all.y = T)
  templist[[i]] <- templist[[i]][,c("country","year","ideology","gender","occupation",
                                    "education","income","knowledge","r1", "r2",
                                    "r3", "r4", "r4a", "r4b", "r5", "r6", "r7", 
                                    "r8", "r12", "r13", "r14", "r15", "r16", 
                                    "r17", "r18", "r20", "r21", "r22", "r23", 
                                    "r24", "r25", "r26","id")]
  # now we need to quantize
  tmpdf <- templist[[i]]
  tmpdf$knowledge <- jitter(as.numeric(tmpdf$knowledge), factor=10e-8)
  # jitter to allow quantization
  knowqnt <- as.character(qcut(tmpdf$knowledge,5)) # just to get labels
  # sanity check: the top quintile must have a higher mean score than the
  # bottom one
  stopifnot(mean(tmpdf$knowledge[which(knowqnt == max(as.numeric(knowqnt),
                                                      na.rm=T))]) >
              mean(tmpdf$knowledge[which(knowqnt == min(as.numeric(knowqnt),
                                                        na.rm=T))]))
  # make sure factor levels align
  tmpdf$knowledge <- knowqnt
  tmpdf <- tmpdf[,c("id","knowledge")]
  # swap the continuous score for the quintile label, matching on ID
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id", all.x = T, all.y = T)
  templist[[i]]$knowledge.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "knowledge.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "knowledge"
  templist[[i]] <- templist[[i]][,c("country","year","ideology","gender","occupation",
                                    "education","income","knowledge","r1", "r2",
                                    "r3", "r4", "r4a", "r4b", "r5", "r6", "r7", 
                                    "r8", "r12", "r13", "r14", "r15", "r16", 
                                    "r17", "r18", "r20", "r21", "r22", "r23", 
                                    "r24", "r25", "r26")]
}
# positions of the household-goods items (r1, r2, ...) in the country-year
# data frames; looked up on the first element of templist
numerics <- which(grepl("r\\d", colnames(templist[[1]])))
# factor material wealth: for each country-year, reduce the binary goods items
# to one score with MCA and store it as quintile labels "0"-"4" in `wealth`.
# The goods columns themselves are dropped on every exit path.
for(i in 1:length(templist)){
  print(i)
  # get the temporary dataframe
  templist[[i]]$id <- rownames(templist[[i]])
  matgoods <- cbind(templist[[i]]$id,templist[[i]][,numerics])
  colnames(matgoods)[1] <- "id"
  # remove all NA columns
  for(j in ncol(matgoods):2){                          # 2 since column 1 is IDs
    if(all(is.na(unique(matgoods[,j]))) == T){
      matgoods <- matgoods[,-j]
    }
  }
  # if no durable goods questions were asked, we skip the country-year
  if(is.null(dim(matgoods)) == T){
    templist[[i]]$wealth <- rep(NA,nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][,c("country","year","ideology","gender",
                                      "wealth","occupation","education",
                                      "income","knowledge")]
    next
  }
  # take only complete cases and factor
  # NOTE(review): there is no special case here for a country-year with a
  # single usable item or with no complete cases -- confirm neither occurs.
  matgoods <- matgoods[complete.cases(matgoods),]
  facs <- MCA(matgoods[,-1], ncp = 5, graph = F)
  # save first factor score
  matgoods$wealth <- facs$ind$coord[,1]
  # need to make sure scales are in the same direction -- make numeric,
  # sort by sum of 1's, then flip axes as necessary
  for(j in 2:(ncol(matgoods)-1)){
    matgoods[,j] <- as.numeric(as.character(matgoods[,j]))
  }
  matgoods <- matgoods[order(rowSums(matgoods[,2:(ncol(matgoods)-1)]),
                             decreasing=T),]
  # compares only the first and last row after sorting by number of goods
  if(as.logical(head(matgoods$wealth, 1) < tail(matgoods$wealth, 1)) == T){
    matgoods$wealth <- matgoods$wealth*(-1)
  }
  # re-merge and store metadata, matching on ID
  templist[[i]] <- merge(templist[[i]],matgoods, by = "id", all.x = T,
                         all.y = T)
  templist[[i]] <- templist[[i]][,c("country","year","ideology","gender","wealth",
                                    "occupation","education","income",
                                    "knowledge","id")]
  # now we need to quantize
  tmpdf <- templist[[i]]
  tmpdf$wealth <- jitter(as.numeric(tmpdf$wealth), factor=10e-8)
  # jitter to allow quantization
  matqnt <- as.character(qcut(tmpdf$wealth,5)) # just to get labels
  # sanity check: the top quintile must have a higher mean score than the
  # bottom one
  stopifnot(mean(tmpdf$wealth[which(matqnt == max(as.numeric(matqnt),
                                                  na.rm=T))]) >
              mean(tmpdf$wealth[which(matqnt == min(as.numeric(matqnt),
                                                    na.rm=T))]))
  # make sure factor levels align
  tmpdf$wealth <- matqnt
  tmpdf <- tmpdf[,c("id","wealth")]
  # swap the continuous score for the quintile label, matching on ID
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id", all.x = T, all.y = T)
  templist[[i]]$wealth.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "wealth.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "wealth"
  templist[[i]] <- templist[[i]][,c("country","year","ideology","gender","wealth",
                                    "occupation","education","income",
                                    "knowledge")]
}
# quantize education: within each country-year, replace the numeric education
# value with a quintile label "0"-"4". Country-years where the item is
# entirely missing are left untouched (they keep the id column added below).
for(i in 1:length(templist)){
  print(i)
  # assign IDs and get temp df
  templist[[i]]$id <- rownames(templist[[i]])
  tmpdf <- templist[[i]][,c("id","education")]
  if(all(is.na(unique(templist[[i]]$education))) == T){
    next
  } else {
    # give everyone an education quantile. break ties with random assignment
    # through jitter. jitter shouldn't matter since it will never separate top
    # and bottom quintiles, which is what we care about
    tmpdf$education <- jitter(as.numeric(tmpdf$education), factor=10e-8)
    eduqnt <- as.character(qcut(tmpdf$education,5)) # just to get labels
    # sanity check: top quintile mean must exceed bottom quintile mean
    stopifnot(mean(tmpdf$education[which(eduqnt == max(as.numeric(eduqnt),
                                                       na.rm=T))]) >
                mean(tmpdf$education[which(eduqnt == min(as.numeric(eduqnt),
                                                         na.rm=T))]))
    # make sure factor levels align
    tmpdf$education <- eduqnt
    # swap the raw column for the labels; the merged column ends up last
    templist[[i]] <- merge(templist[[i]],tmpdf, by = "id", all.x = T, all.y = T)
    templist[[i]]$education.x <- NULL
    stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "education.y")
    colnames(templist[[i]])[ncol(templist[[i]])] <- "education"
  }
}
# quantize income: same procedure as for education, reusing the id column
# created in the loop above
for(i in 1:length(templist)){
  print(i)
  # get temp df
  tmpdf <- templist[[i]][,c("id","income")]
  if(all(is.na(unique(templist[[i]]$income))) == T){
    next
  } else {
    # give everyone an income quantile. break ties with random assignment
    # through jitter
    tmpdf$income <- jitter(as.numeric(tmpdf$income), factor=10e-8)
    incqnt <- as.character(qcut(tmpdf$income,5)) # just to get labels
    # sanity check: top quintile mean must exceed bottom quintile mean
    stopifnot(mean(tmpdf$income[which(incqnt == max(as.numeric(incqnt),
                                                    na.rm=T))]) >
                mean(tmpdf$income[which(incqnt == min(as.numeric(incqnt),
                                                      na.rm=T))]))
    # make sure factor levels align
    tmpdf$income <- incqnt
    # swap the raw column for the labels; the merged column ends up last
    templist[[i]] <- merge(templist[[i]],tmpdf, by = "id", all.x = T, all.y = T)
    templist[[i]]$income.x <- NULL
    stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "income.y")
    colnames(templist[[i]])[ncol(templist[[i]])] <- "income"
  }
}
# recombine (rbind matches the data-frame columns by name)
lapop <- do.call(rbind, templist)
# make variables the right class/scale
# ideology: text endpoints become the scale ends, non-response becomes NA
left_labels <- c("Izquierda", "Left")
right_labels <- c("Derecha", "Right")
ideology_missing <- c("No Sabe", "Don't Know", "No Responde", "No Response",
                      "No Aplica", "Not Applicable",
                      "No se le preguntó en este país o año",
                      "Not asked in this country or year")
lapop$ideology[lapop$ideology %in% left_labels] <- "1"
lapop$ideology[lapop$ideology %in% right_labels] <- "10"
lapop$ideology[lapop$ideology %in% ideology_missing] <- NA
lapop$ideology <- as.numeric(lapop$ideology)
# the four quintile variables become factors with levels "0" to "4"
quintile_levels <- c("0", "1", "2", "3", "4")
lapop$wealth <- factor(lapop$wealth, levels = quintile_levels)
lapop$education <- factor(lapop$education, levels = quintile_levels)
lapop$income <- factor(lapop$income, levels = quintile_levels)
lapop$knowledge <- factor(lapop$knowledge, levels = quintile_levels)
# save original scale
lapop$ideology_scale_orig <- rep("1-10", nrow(lapop))
# scale everything: 1-10 ideology mapped linearly onto [-1, 1]
lapop$ideology_scaled <- rescalr(lapop$ideology, 1, 10, -1, 1)
lapop$survey <- rep("LAPOP", nrow(lapop))
harmonized_cols <- c("country", "year", "survey", "ideology", "gender",
                     "wealth", "occupation", "education", "income",
                     "knowledge", "ideology_scaled", "ideology_scale_orig")
lapop <- lapop[order(lapop$country, lapop$year), harmonized_cols]
# clean up
write.csv(lapop, "./cleaned-data/lapop.csv", row.names = F)
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df",
                          "qcut", "rescalr", "simpleCap", "df.final")))

### 79. Birch Ukraine sample - mass
# helper: every element of x that appears in names(map) is replaced by the
# mapped value; everything else (including NA) is left untouched
recode_values <- function(x, map) {
  hit <- x %in% names(map)
  x[hit] <- unname(map[x[hit]])
  x
}
# read the Stata file, turn labelled values into their labels, keep the five
# variables we need and work with plain character columns from here on
birch <- as.data.frame(as_factor(read_dta("./Birch/stata8/h079a.dta")))
birch <- birch[, c("p173", "p192", "p207", "p202", "p193")]
colnames(birch) <- c("ideology", "gender", "occupation", "income", "education")
birch[] <- lapply(birch, as.character)
# ideology: five labelled positions become "1"-"5"; non-answers become NA
birch$ideology <- recode_values(birch$ideology,
                                c("leftist" = "1",
                                  "centre-left" = "2",
                                  "centrist" = "3",
                                  "national-democratic" = "4",
                                  "ukrainian nationalist" = "5"))
birch$ideology[birch$ideology %in%
                 c("d/k", "none of the above", "n/a")] <- NA
# gender: capitalise the labels
birch$gender <- recode_values(birch$gender,
                              c("female" = "Female", "male" = "Male"))
# constant identifiers and the rescaled ideology measure (1-5 mapped to [-1, 1])
birch$year <- rep("1998", nrow(birch))
birch$country <- rep("Ukraine", nrow(birch))
birch$ideology <- as.numeric(birch$ideology)
birch$ideology_scaled <- rescalr(birch$ideology, 1, 5, -1, 1)
birch$ideology_scale_orig <- rep("1-5", nrow(birch))
birch$survey <- rep("Birch 1999", nrow(birch))
# income: brackets become "0"-"5"; non-answers become NA
birch$income[birch$income %in% c("n/a", "d/k")] <- NA
birch$income <- recode_values(birch$income,
                              c("no income" = "0",
                                "1-80 hryvna" = "1",
                                "81-150 hryvna" = "2",
                                "151-300 hryvna" = "3",
                                "301-600 hryvna" = "4",
                                "more than 600 hryvna" = "5"))
# education: attainment labels become "1"-"9"; non-answers become NA
birch$education[birch$education %in% c("d/k", "n/a")] <- NA
birch$education <- recode_values(
  birch$education,
  c("less than 4 years" = "1",
    "primary (less than 7 years)" = "2",
    "vocational training school after 7-8 years" = "3",
    "incomplete secondary (less than 10 years)" = "4",
    "specialised secondary (technical school, college etc.)" = "5",
    "vocational training school after 10-11 years" = "6",
    "complete secondary  (10-11 years)" = "7",
    "incomplete higher (3 years and more)" = "8",
    "higher" = "9"))
# cut income, then education, into quintiles; a tiny jitter breaks ties so the
# quantile breakpoints are unique (income is jittered first, education second)
birch$income <- as.character(
  qcut(jitter(as.numeric(birch$income), factor = 10e-8), 5))
birch$education <- as.character(
  qcut(jitter(as.numeric(birch$education), factor = 10e-8), 5))
### code occupation into professional / worker / other
professional_jobs <- c("accountant", "artist", "businessman",
                       "chief of housing operational office",
                       "deputy director joint-stock company",
                       "doctor", "economist", "engineer",
                       "enterprise director", "financial services manager",
                       "hydrogeologist, geodesist", "lawyer",
                       "pension fund expert", "postal director",
                       "power engineer", "programmer", "psychologist",
                       "tax inspector", "veterinary surgeon")
worker_jobs <- c("auto mechanic", "boatswain", "bodyguard",
                 "boiler-room operator", "builder", "bulldozer operator",
                 "butcher", "calf-tender, pig-tender", "cementer", "cleaner",
                 "collective farmer", "driver", "farmer", "fisherman",
                 "fitter", "guard", "house-painter, plasterer", "insulater",
                 "joiner", "lift-men", "loader", "machine operator",
                 "mechanic", "milkmaid", "miner", "motor mechanic", "moulder",
                 "printing industry worker", "railway worker", "shoe-maker",
                 "smith", "tractor operator", "waiter", "welder",
                 "woodcutter")
birch$occupation[birch$occupation %in% professional_jobs] <- "professional"
birch$occupation[birch$occupation %in% worker_jobs] <- "worker"
birch$occupation[birch$occupation %in% c("d/k", "n/a")] <- NA
# whatever is left and not missing is lumped into "other"
birch$occupation[!(birch$occupation %in%
                     c("worker", "professional", NA))] <- "other"
# this survey has no knowledge or wealth measures
birch$knowledge <- rep(NA, nrow(birch))
birch$wealth <- birch$knowledge
# sort and keep the standard column layout
final_cols <- c("country", "year", "survey", "ideology", "gender", "wealth",
                "occupation", "education", "income", "knowledge",
                "ideology_scaled", "ideology_scale_orig")
birch <- birch[order(birch$country, birch$year), final_cols]
# save for later, then clear everything except the shared utilities
write.csv(birch, "./cleaned-data/birch-mass.csv", row.names = FALSE)
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df",
                          "qcut", "rescalr", "simpleCap", "df.final")))

### 80. World Values Survey
# helper: every element of x that appears in names(map) is replaced by the
# mapped value; everything else (including NA) is left untouched
recode_values <- function(x, map) {
  hit <- x %in% names(map)
  x[hit] <- unname(map[x[hit]])
  x
}
# the negative codes mark the various kinds of non-response
wvs_missing <- c("-1", "-2", "-3", "-4", "-5")
# read the longitudinal file, turn labelled values into their labels and keep
# the seven variables we need
wvs <- as.data.frame(as_factor(
  read_dta("./WVS/WVS_Longitudinal_1981_2014_stata_v2015_04_18.dta",
           encoding = "latin1")))
wvs <- wvs[, c("S003A", "S020", "E033", "X001", "X025", "X036", "X047")]
colnames(wvs) <- c("country", "year", "ideology", "gender", "education",
                   "occupation", "income")
wvs$wealth <- rep(NA, nrow(wvs))
wvs$survey <- rep("World Values Survey", nrow(wvs))
# work with plain character columns from here on
wvs[] <- lapply(wvs, as.character)
# fix countries: rename to the spellings used in the rest of the script
wvs$country <- recode_values(wvs$country,
                             c("Bosnia" = "Bosnia and Herzegovina",
                               "Czech Rep." = "Czech Republic",
                               "Dominican Rep." = "Dominican Republic",
                               "East Germany" = "Germany East",
                               "Egypt" = "Egypt, Arab Rep.",
                               "Great Britain" = "United Kingdom",
                               "Hong Kong" = "Hong Kong SAR, China",
                               "Iran" = "Iran, Islamic Rep.",
                               "Kyrgyzstan" = "Kyrgyz Republic",
                               "Macedonia" = "Macedonia, FYR",
                               "Palestine" = "West Bank and Gaza",
                               "Russia" = "Russian Federation",
                               "Slovakia" = "Slovak Republic",
                               "South Korea" = "Korea, Rep.",
                               "Venezuela" = "Venezuela, RB",
                               "Viet Nam" = "Vietnam",
                               "West Germany" = "Germany West",
                               "Yemen" = "Yemen, Rep."))
# fix gender: anything other than the two labels is missing
wvs$gender[!(wvs$gender %in% c("Female", "Male"))] <- NA
# fix ideology - endpoints are labelled; as.numeric() will NA out any
# remaining non-numeric answers
wvs$ideology[wvs$ideology %in% wvs_missing] <- NA
wvs$ideology <- recode_values(wvs$ideology, c("Left" = "1", "Right" = "10"))
wvs$ideology <- as.numeric(wvs$ideology)
# fix occupation: drop non-response, then strip everything except letters,
# digits and spaces from the labels
wvs$occupation[wvs$occupation %in% wvs_missing] <- NA
wvs$occupation <- gsub("[^[:alnum:] ]", "", wvs$occupation)
# fix income - step labels (spelled as they appear in the data) become
# "1"-"10"; again as.numeric() will NA any remaining bad answers
wvs$income[wvs$income %in% wvs_missing] <- NA
wvs$income <- recode_values(wvs$income,
                            c("Tenth step" = "10",
                              "Nineth step" = "9",
                              "Eigth step" = "8",
                              "Seventh step" = "7",
                              "Sixth step" = "6",
                              "Fifth step" = "5",
                              "Fourth step" = "4",
                              "Third step" = "3",
                              "second step" = "2",
                              "Lower step" = "1"))
wvs$income <- as.numeric(wvs$income)
# fix education - the (truncated) attainment labels become "1"-"8"; again
# as.numeric() will create NAs for anything else
wvs$education[wvs$education %in% wvs_missing] <- NA
wvs$education <- recode_values(
  wvs$education,
  c("Inadequately completed elementary education" = "1",
    "Completed (compulsory) elementary education" = "2",
    "Incomplete secondary school: technical/vocational type/(Comp" = "3",
    "Incomplete secondary: university-preparatory type/Secondary," = "4",
    "Complete secondary school: technical/vocational type/Seconda" = "5",
    "Complete secondary: university-preparatory type/Full seconda" = "6",
    "Some university without degree/Higher education - lower-leve" = "7",
    "University with degree/Higher education - upper-level tertia" = "8"))
wvs$education <- as.numeric(wvs$education)
# split into each country-year; split() also creates empty data frames for
# country/year combinations that never occur, so drop those
templist <- split(wvs, f = list(wvs$country, wvs$year))
templist <- templist[vapply(templist, nrow, integer(1)) > 0]

# columns every country-year data frame carries out of the quantile step
wvs_cols <- c("country", "year", "survey", "ideology", "gender", "wealth",
              "occupation", "education", "income")

# Replace `column` in every data frame of `groups` by its within-group
# quantile label ("0" = bottom, n - 1 = top), returned as character.
#   groups: list of country-year data frames
#   column: name of the numeric column to quantize
#   keep:   columns (and their order) to retain in each data frame
#   n:      number of quantiles
# Ties are broken by a tiny jitter so that the quantile breakpoints are
# unique; the jitter is far too small to move anyone between the top and
# bottom quantiles, which is what we care about. Groups where `column` is
# entirely missing are passed through with the column set to NA and draw no
# random numbers. Returns the updated list.
quantize_groups <- function(groups, column, keep, n = 5) {
  for (i in seq_along(groups)) {
    print(i)
    grp <- groups[[i]]
    # assign IDs so the quantile labels can be merged back row by row
    grp$id <- rownames(grp)
    if (all(is.na(grp[[column]]))) {
      grp[[column]] <- rep(NA, nrow(grp))
      groups[[i]] <- grp[, keep]
      next
    }
    scores <- jitter(as.numeric(grp[[column]]), factor = 10e-8)
    labels <- as.character(qcut(scores, n))
    # sanity check: the top quantile must have a higher mean than the bottom
    top <- which(labels == max(as.numeric(labels), na.rm = TRUE))
    bottom <- which(labels == min(as.numeric(labels), na.rm = TRUE))
    stopifnot(mean(scores[top]) > mean(scores[bottom]))
    # merge the labels back on id; merge() suffixes the clashing column with
    # .x (original values) and .y (quantile labels), so keep only the latter
    lookup <- data.frame(id = grp$id, value = labels,
                         stringsAsFactors = FALSE)
    names(lookup)[2] <- column
    grp <- merge(grp, lookup, by = "id")
    grp[[paste0(column, ".x")]] <- NULL
    names(grp)[names(grp) == paste0(column, ".y")] <- column
    groups[[i]] <- grp[, keep]
  }
  groups
}

# factor income for every country-year first, then education; this order fixes
# the sequence of random draws used for tie-breaking
templist <- quantize_groups(templist, "income", wvs_cols)
templist <- quantize_groups(templist, "education", wvs_cols)
# stack the country-year data frames back into one WVS data frame
wvs <- do.call(rbind, templist)
# the WVS extract carries no knowledge measure
wvs$knowledge <- rep(NA, nrow(wvs))
### code up occupation into worker / professional / other (labels have
### already been stripped of punctuation)
worker_jobs <- c("Unskilled manual",
                 "Never had a job",
                 "Agricultural worker")
professional_jobs <- c(
  "Employermanager of establishment with 10 or more employed",
  "Professional worker",
  "Supervisory Non manual office worker")
wvs$occupation[wvs$occupation %in% worker_jobs] <- "worker"
wvs$occupation[wvs$occupation %in% professional_jobs] <- "professional"
# whatever is left and not missing is lumped into "other"
wvs$occupation[!(wvs$occupation %in%
                   c("worker", "professional", NA))] <- "other"
# record the original ideology scale, then map 1-10 onto [-1, 1]
wvs$ideology_scale_orig <- rep("1-10", nrow(wvs))
wvs$ideology_scaled <- rescalr(wvs$ideology, 1, 10, -1, 1)
# sort by country-year and keep the standard column layout
final_cols <- c("country", "year", "survey", "ideology", "gender", "wealth",
                "occupation", "education", "income", "knowledge",
                "ideology_scaled", "ideology_scale_orig")
wvs <- wvs[order(wvs$country, wvs$year), final_cols]
# save, then clear everything except the shared utilities and df.final
write.csv(wvs, "./cleaned-data/wvs.csv", row.names = FALSE)
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df",
                          "qcut", "rescalr", "simpleCap", "df.final")))

### Sweden National Election Study
# read every wave's spreadsheet into a plain data frame; the files are taken
# in the order list.files() returns them
# NOTE(review): the layout table below assumes that order is chronological
# (1985 first, 2010 last) - confirm against the directory contents
files <- list.files(path = "./SNES", pattern = "\\.xls$|\\.XLSX$",
                    recursive = TRUE)
files <- paste0("./SNES/", files)
data.list <- lapply(files, function(f) as.data.frame(read_excel(f)))
# source columns for each wave, always in the order: ideology, gender, the
# knowledge ("info") items, occupation, education. The number of info items
# varies by wave (5 to 9) and is implied by the length of each vector.
snes_layout <- list(
  "1985" = c("V201", "V365", "V256", "V257", "V258", "V259", "V260",
             "V316", "V319"),
  "1988" = c("V178", "V366", "V229", "V230", "V231", "V232", "V233",
             "V289", "V299"),
  "1991" = c("v217", "v15", "v283", "v284", "v285", "v286", "v287",
             "v288", "v289", "v356", "v359"),
  "1994" = c("v212", "v12", "v295", "v296", "v297", "v298", "v299",
             "v300", "v361", "v368"),
  "1998" = c("v366", "v12", "v219", "v220", "v221", "v222", "v223",
             "v224", "v225", "v380", "v384"),
  "2002" = c("V247", "V12", "V291", "V292", "V293", "V294", "V295",
             "V296", "V297", "V298", "V490", "V500"),
  "2006" = c("V596", "SEX", "V611", "V612", "V613", "V614", "V615",
             "V616", "V617", "V618", "V765", "V772"),
  "2010" = c("VU10_V885", "VU10_V7030", "VU10_V925", "VU10_V926",
             "VU10_V927", "VU10_V928", "VU10_V929", "VU10_V930",
             "VU10_V931", "VU10_V932", "VU10_V933", "VU10_V1096",
             "VU10_V1116")
)
# fail early and clearly if the directory does not hold exactly one file per
# wave, instead of failing later on a missing column
stopifnot(length(data.list) == length(snes_layout))
snes_info <- paste0("info", 1:9)
snes_cols <- c("year", "ideology", "gender", snes_info,
               "occupation", "education")
for (i in seq_along(snes_layout)) {
  src_cols <- snes_layout[[i]]
  # everything except ideology, gender, occupation, education is an info item
  n_info <- length(src_cols) - 4
  wave <- data.list[[i]][, src_cols]
  names(wave) <- c("ideology", "gender", paste0("info", seq_len(n_info)),
                   "occupation", "education")
  wave$year <- rep(names(snes_layout)[i], nrow(wave))
  # pad the info items this wave did not ask, so all waves share one layout
  for (missing_item in setdiff(snes_info, names(wave))) {
    wave[[missing_item]] <- rep(NA, nrow(wave))
  }
  wave <- wave[, snes_cols]
  # plain character columns everywhere so the waves can be stacked
  wave[] <- lapply(wave, as.character)
  data.list[[i]] <- wave
}
sw <- do.call(rbind, data.list)
rm(data.list, files, snes_layout, snes_info, snes_cols, src_cols, n_info, wave)
# constant columns: single country, and no income measure in these data
sw$country <- rep("Sweden", nrow(sw))
sw$income <- rep(NA, nrow(sw))
# clean gender
sw$gender[sw$gender %in% "2"] <- "Female"
sw$gender[sw$gender %in% "1"] <- "Male"
# clean ideology and education: shared non-response codes become NA
snes_missing <- c("77", "88", "8888")
sw$ideology[sw$ideology %in% snes_missing] <- NA
sw$education[sw$education %in% snes_missing] <- NA
# education codes "8" (1985-1991) and "9" (1985) are also treated as missing
sw$education[sw$education %in% "8" &
               sw$year %in% c("1985", "1988", "1991")] <- NA
sw$education[sw$education %in% "9" & sw$year %in% "1985"] <- NA
# clean occupation: non-response codes, then worker / professional / other
sw$occupation[sw$occupation %in% c("88", "96", "99", "8888", "9000",
                                   "9995", "9998", "9999")] <- NA
sw$occupation[sw$occupation %in% "4"] <- "worker"
sw$occupation[sw$occupation %in% c("2", "3")] <- "professional"
sw$occupation[!(sw$occupation %in% c("worker", "professional", NA))] <-
  "other"
# clean information variables: non-response codes become NA
infor <- grep("info", names(sw))
for (i in infor) {
  sw[[i]][sw[[i]] %in% c("6", "7", "8", "9", "8888")] <- NA
}
# answer key: for each item, the years in which response code "1" is the
# correct answer and the years in which code "5" is
answer_key <- list(
  info1 = list("1" = c("1985", "1988", "1991"),
               "5" = c("1994", "1998", "2002", "2006", "2010")),
  info2 = list("1" = c("1994", "1998", "2002", "2006", "2010"),
               "5" = c("1985", "1988", "1991")),
  info3 = list("1" = c("1985", "1988", "1991", "2002", "2006"),
               "5" = c("1994", "1998", "2010")),
  info4 = list("1" = c("1991", "1994", "1998", "2006"),
               "5" = c("1985", "1988", "2002", "2010")),
  info5 = list("1" = c("2002", "2006", "2010"),
               "5" = c("1985", "1988", "1991", "1994", "1998")),
  info6 = list("1" = c("1994", "1998", "2006", "2010"),
               "5" = c("1985", "1988", "1991", "2002")),
  info7 = list("1" = c("1991", "2002"),
               "5" = c("1985", "1988", "1994", "1998", "2006", "2010")),
  info8 = list("1" = c("2002", "2006", "2010"),
               "5" = c("1985", "1988", "1991", "1994", "1998")),
  info9 = list("5" = c("2010"))
)
for (item in names(answer_key)) {
  for (code in names(answer_key[[item]])) {
    hits <- which(sw$year %in% answer_key[[item]][[code]] &
                    sw[[item]] == code)
    sw[[item]][hits] <- "correct"
  }
}
# any remaining non-missing response is wrong; then recode knowledge to 0/1
# now that it's clean (kept as character)
for (i in infor) {
  answers <- sw[[i]]
  answers[!(answers %in% c("correct", NA))] <- "incorrect"
  answers[which(answers == "correct")] <- "1"
  answers[which(answers == "incorrect")] <- "0"
  sw[[i]] <- answers
}
# split the pooled data into one data frame per survey year (the split factor
# is year only)
templist <- split(sw, f = sw$year)
# positions of the knowledge-item columns (every column whose name contains
# "info")
knowledge <- which(grepl("info", colnames(sw)))
# factor information: for each year, collapse the knowledge items into one
# score (first MCA dimension) and cut that score into quintile labels "0"-"4"
for(i in 1:length(templist)){
  print(i)
  # get the temporary dataframe: row IDs plus this year's knowledge items
  templist[[i]]$id <- rownames(templist[[i]])
  knowmat <- cbind(templist[[i]]$id,templist[[i]][,knowledge])
  colnames(knowmat)[1] <- "id"
  # remove all NA columns, i.e. items not asked in this year; iterate from the
  # last column backwards so deletions do not shift columns still to be checked
  for(j in ncol(knowmat):2){       # 2 since column 1 is IDs
    if(all(is.na(unique(knowmat[,j]))) == T){
      knowmat <- knowmat[,-j]
    }
  }
  # if no info questions were asked, we skip the year: dropping the last item
  # column collapses knowmat to a plain vector, which is why dim() is NULL here
  if(is.null(dim(knowmat)) == T){
    templist[[i]]$knowledge <- rep(NA,nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][,c("country","year","ideology","gender","occupation",
                                      "education","income","knowledge")]
    next
  }
  # take only complete cases for the MCA below
  # NOTE(review): an earlier comment here said that when this eliminates all
  # observations one question is dropped; the code does not do that. The only
  # special case handled is a single remaining item (id + 1 column), below.
  kmat <- knowmat[complete.cases(knowmat),]
  if(dim(kmat)[2] == 2){
    # single item: use it directly (from knowmat, so rows with NA are kept) and
    # relabel "1" as "4"; "0" is left as is -- presumably so that the two
    # answers carry the top and bottom quintile labels (TODO confirm)
    colnames(knowmat)[2] <- "knowledge"
    stopifnot(nrow(templist[[i]]) == nrow(knowmat))
    knowmat$knowledge <- as.character(knowmat$knowledge)
    knowmat$knowledge[which(knowmat$knowledge == "1")] <- "4"
    templist[[i]]$knowledge <- knowmat$knowledge
    templist[[i]] <- templist[[i]][,c("country","year","ideology","gender","occupation",
                                      "education","income","knowledge")]
    next
  }
  # MCA on the item columns (ID column excluded)
  facs <- MCA(kmat[,-1], ncp = 5, graph = F)
  # save first factor score (coordinates on the first dimension)
  kmat$knowledge <- facs$ind$coord[,1]
  knowmat <- kmat
  # need to make sure scales are in the same direction -- make numeric,
  # sort by sum of 1's, then flip axes as necessary
  for(j in 2:(ncol(knowmat)-1)){
    knowmat[,j] <- as.numeric(as.character(knowmat[,j]))
  }
  knowmat <- knowmat[order(rowSums(knowmat[,
                                           2:(ncol(knowmat)-1)]),decreasing=T),]
  # after sorting, the first row has the highest item sum and the last row the
  # lowest; if the first row's score is lower than the last row's, flip the sign
  if(as.logical(head(knowmat$knowledge, 1) < tail(knowmat$knowledge, 1)) == T){
    knowmat$knowledge <- knowmat$knowledge*(-1)
  }
  # re-merge and store metadata, matching on ID; rows that were not complete
  # cases are kept (all.x) and get NA knowledge
  templist[[i]] <- merge(templist[[i]],knowmat, by = "id", all.x = T, all.y = T)
  templist[[i]] <- templist[[i]][,c("country","year","ideology","gender","occupation",
                                    "education","income","knowledge","id")]
  # now we need to quantize
  tmpdf <- templist[[i]]
  tmpdf$knowledge <- jitter(as.numeric(tmpdf$knowledge), factor=10e-8)
  # jitter to allow quantization
  knowqnt <- as.character(qcut(tmpdf$knowledge,5)) # just to get labels
  # sanity check: the top quintile must have a higher mean score than the bottom
  stopifnot(mean(tmpdf$knowledge[which(knowqnt == max(as.numeric(knowqnt),
                                                      na.rm=T))]) >
              mean(tmpdf$knowledge[which(knowqnt == min(as.numeric(knowqnt),
                                                        na.rm=T))]))
  # make sure factor levels align: replace the score by its quintile label and
  # merge it back on ID; merge() suffixes the clashing columns as knowledge.x
  # (continuous score, dropped) and knowledge.y (quintile label, kept)
  tmpdf$knowledge <- knowqnt
  tmpdf <- tmpdf[,c("id","knowledge")]
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id", all.x = T, all.y = T)
  templist[[i]]$knowledge.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "knowledge.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "knowledge"
  templist[[i]] <- templist[[i]][,c("country","year","ideology","gender","occupation",
                                    "education","income","knowledge")]
}
# quantize education: within each year, replace the education code by its
# quintile label ("0" = bottom, "4" = top)
for (i in seq_along(templist)) {
  print(i)
  # assign IDs so the quintile labels can be merged back onto the right rows
  templist[[i]]$id <- rownames(templist[[i]])
  # years without any education information are left untouched (all NA)
  if (all(is.na(templist[[i]]$education))) {
    next
  }
  # give everyone an education quantile. break ties with random assignment
  # through jitter. jitter shouldn't matter since it will never separate top
  # and bottom quintiles, which is what we care about
  edu_lookup <- templist[[i]][, c("id", "education")]
  edu_lookup$education <- jitter(as.numeric(edu_lookup$education),
                                 factor = 10e-8)
  edu_quintile <- as.character(qcut(edu_lookup$education, 5))
  # sanity check: the top quintile must have a higher mean than the bottom
  top <- which(edu_quintile == max(as.numeric(edu_quintile), na.rm = TRUE))
  bottom <- which(edu_quintile == min(as.numeric(edu_quintile), na.rm = TRUE))
  stopifnot(mean(edu_lookup$education[top]) >
              mean(edu_lookup$education[bottom]))
  # merge the labels back on ID; merge() suffixes the clashing columns as
  # education.x (original codes, dropped) and education.y (labels, kept)
  edu_lookup$education <- edu_quintile
  templist[[i]] <- merge(templist[[i]], edu_lookup, by = "id",
                         all.x = TRUE, all.y = TRUE)
  templist[[i]]$education.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "education.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "education"
}
# stack the yearly data frames back together; the helper ID column and the
# row names inherited from the split are no longer needed
sw <- do.call(rbind, templist)
sw$id <- NULL
row.names(sw) <- NULL
# fill in variables: survey label, and no wealth measure in these data
sw$survey <- rep("SNES", nrow(sw))
sw$wealth <- rep(NA, nrow(sw))
# rescale ideology: record the original 0-10 scale, then map it onto [-1, 1]
sw$ideology_scale_orig <- rep("0-10", nrow(sw))
sw$ideology <- as.numeric(sw$ideology)
sw$ideology_scaled <- rescalr(sw$ideology, 0, 10, -1, 1)
# sort by country-year and keep the standard column layout
final_cols <- c("country", "year", "survey", "ideology", "gender", "wealth",
                "occupation", "education", "income", "knowledge",
                "ideology_scaled", "ideology_scale_orig")
sw <- sw[order(sw$country, sw$year), final_cols]
# save, then clear everything except the shared utilities and df.final
write.csv(sw, "./cleaned-data/swmass.csv", row.names = FALSE)
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df",
                          "qcut", "rescalr", "simpleCap", "df.final")))

### Swiss mass data
swiss <- read_dta("./Switzerland-mass/495_Selects_CumulativeFile_Data_1971-2015_v1.0.dta")
swiss <- as_factor(swiss)
swiss <- as.data.frame(swiss)
swiss <- swiss[,c("year","educ","income","lr1", "sex", "sch7a", "sch7r")]
names(swiss) <- c("year", "education", "income", "ideology", "gender", "occupation1", "occupation2")
swiss$wealth <- swiss$knowledge <- rep(NA,nrow(swiss))
# restrict to relevant years
swiss <- swiss[which(swiss$year %in% c(1975,1979)),]
swiss$year <- as.character(swiss$year)
# clean gender
swiss$gender <- as.character(swiss$gender)
swiss$gender[which(swiss$gender == "na")] <- NA
swiss$gender[which(swiss$gender == "male")] <- "Male"
swiss$gender[which(swiss$gender == "female")] <- "Female"
# clean education
swiss$education <- as.character(swiss$education)
swiss$education[which(swiss$education %in% c("-2", "-1"))] <- NA
swiss$education[which(swiss$education %in% c("dk", "na"))] <- NA
swiss$education[which(swiss$education == "primary school")] <- "1"
swiss$education[which(swiss$education == "compulsory education")] <- "2"
swiss$education[which(swiss$education == "vocational education")] <- "3"
swiss$education[which(swiss$education == "diploma school")] <- "4"
swiss$education[which(swiss$education == "high school")] <- "5"
swiss$education[which(swiss$education == "higher vocational training")] <- "6"
swiss$education[which(swiss$education == "university")] <- "7"
# clean income
swiss$income <- as.character(swiss$income)
swiss$income[which(swiss$income == "-2")] <- NA
swiss$income[which(swiss$income == "na")] <- NA
swiss$income[which(swiss$income == "low income")] <- "0"
swiss$income[which(swiss$income %in% c("rather low income", 
                                       " rather low income"))] <- "1"
swiss$income[which(swiss$income == "medium income")] <- "2"
swiss$income[which(swiss$income %in% c("rather high income",
                                       " rather high income"))] <- "3"
swiss$income[which(swiss$income == "high income")] <- "4"
# clean ideology - the as.numeric will produce NA but that's OK
swiss$ideology <- as.character(swiss$ideology)
swiss$ideology[which(swiss$ideology == "left")] <- "0"
swiss$ideology[which(swiss$ideology == "right")] <- "10"
swiss$ideology <- as.numeric(swiss$ideology)
swiss$ideology[which(swiss$ideology < 0)] <- NA
# clean occupation; separate variables for distinct years
# Collapse a raw occupation variable into "worker" / "professional" /
# "other", keeping NA as NA. `missing_codes` are turned into NA first.
recode_occupation <- function(x, missing_codes, worker_codes,
                              professional_codes) {
  x <- as.character(x)
  x[x %in% missing_codes] <- NA
  x[x %in% worker_codes] <- "worker"
  x[x %in% professional_codes] <- "professional"
  # %in% matches NA against NA, so missing values are not turned into "other"
  x[!(x %in% c("worker", "professional", NA))] <- "other"
  x
}
swiss$occupation1 <- recode_occupation(
  swiss$occupation1,
  missing_codes = c("-2", "na"),
  worker_codes = c("self-employed farmers",
                   "semi/unskilled and farm workers"),
  professional_codes = c("managers, administrative and commercial specialists",
                         "technical experts",
                         "social and cultural specialists")
)
swiss$occupation2 <- recode_occupation(
  swiss$occupation2,
  missing_codes = c("-2", "-3"),
  worker_codes = c("1", "2", "3"),
  professional_codes = c("6", "7")
)
# merge occupation: the 1975 wave uses occupation1, all other waves use
# occupation2
is_1975 <- swiss$year == "1975"
swiss$occupation <- ifelse(is_1975, swiss$occupation1, swiss$occupation2)
swiss$occupation2 <- NULL
swiss$occupation1 <- NULL
# add country and survey: constant identifier columns, one value per row
n_swiss <- nrow(swiss)
swiss$country <- rep_len("Switzerland", n_swiss)
swiss$survey <- rep_len("Swiss Election Study", n_swiss)
# quantize education
# Within each survey year, replace the education code by its quintile label
# ("0" = bottom ... "4" = top, as produced by qcut()). Years in which
# education is entirely missing are left as they are.
templist <- split(swiss, f = swiss$year)
# seq_along() rather than 1:length(): the latter would iterate over c(1, 0)
# if the split produced no groups
for (i in seq_along(templist)) {
  print(i)
  # assign IDs and get temp df
  templist[[i]]$id <- rownames(templist[[i]])
  tmpdf <- templist[[i]][, c("id", "education")]
  if (all(is.na(templist[[i]]$education))) {
    # nothing to quantize for this year
    next
  }
  # give everyone an education quantile. break ties with random assignment
  # through jitter. jitter shouldn't matter since it will never separate top
  # and bottom quintiles, which is what we care about
  tmpdf$education <- jitter(as.numeric(tmpdf$education), factor = 10e-8)
  eduqnt <- as.character(qcut(tmpdf$education, 5)) # just to get labels
  # sanity check: the highest quantile label must hold the higher education
  # values, i.e. the labels run in the same direction as the data
  top_label <- max(as.numeric(eduqnt), na.rm = TRUE)
  bottom_label <- min(as.numeric(eduqnt), na.rm = TRUE)
  stopifnot(mean(tmpdf$education[which(eduqnt == top_label)]) >
              mean(tmpdf$education[which(eduqnt == bottom_label)]))
  # swap the raw education column for the quantile labels, matching on id
  tmpdf$education <- eduqnt
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id",
                         all.x = TRUE, all.y = TRUE)
  templist[[i]]$education.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "education.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "education"
}
swiss <- do.call(rbind, templist)
swiss$id <- NULL; row.names(swiss) <- NULL
# finish up: record the original ideology scale, add the [-1, 1] rescaled
# version, keep the standard mass-survey columns sorted by country and year,
# and write the cleaned file
swiss$ideology_scale_orig <- rep_len("0-10", nrow(swiss))
swiss$ideology_scaled <- rescalr(swiss$ideology, theoryMin = 0,
                                 theoryMax = 10, newMin = -1, newMax = 1)
swiss_cols <- c("country", "year", "survey", "ideology", "gender", "wealth",
                "occupation", "education", "income", "knowledge",
                "ideology_scaled", "ideology_scale_orig")
swiss_order <- order(swiss$country, swiss$year)
swiss <- swiss[swiss_order, swiss_cols]
write.csv(swiss, "./cleaned-data/swiss-mass.csv", row.names = FALSE)
# clear the workspace except for the shared utilities and the base panel
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df",
                          "qcut", "rescalr", "simpleCap", "df.final")))

##### MERGE MASS AND ELITE CLEANED DATA #####

### Stack the cleaned survey files into one merged data frame for the mass
### surveys and one for the elite surveys
files <- list.files(path = "./cleaned-data", pattern = "\\.csv$",
                    recursive = TRUE)
# remove ANES since no longer using it. A logical mask is used on purpose:
# files[-which(files == "anes.csv")] returns an EMPTY vector when anes.csv is
# absent, which would silently drop every input file
files <- files[files != "anes.csv"]
files <- paste0("./cleaned-data/", files)
data.list <- lapply(files, read.csv)
# separate data types: the files listed here are mass surveys, every other
# file is treated as an elite survey. (anes.csv is excluded above, so its
# entry here has no effect.)
mass.files <- paste0("./cleaned-data/",
                     c("eurobarometer.csv", "lapop.csv",
                       "malaise-citizens.csv", "fneps-mass.csv",
                       "birch-mass.csv", "anes.csv", "jgss.csv", "cses.csv",
                       "swmass.csv", "swiss-mass.csv", "wvs.csv"))
mass <- which(files %in% mass.files)
elite <- setdiff(seq_along(files), mass)
# make sure variable classes align, so that rbind() stacks columns of the
# same type; the as.character() step guards against factor columns
for (i in seq_along(data.list)) {
  data.list[[i]]$country <- as.character(data.list[[i]]$country)
  data.list[[i]]$year <- as.character(data.list[[i]]$year)
  data.list[[i]]$survey <- as.character(data.list[[i]]$survey)
  data.list[[i]]$ideology <- as.numeric(as.character(data.list[[i]]$ideology))
  data.list[[i]]$ideology_scaled <-
    as.numeric(as.character(data.list[[i]]$ideology_scaled))
  data.list[[i]]$ideology_scale_orig <-
    as.character(data.list[[i]]$ideology_scale_orig)
}
# columns that only the mass files carry
for (i in mass) {
  data.list[[i]]$gender <- as.character(data.list[[i]]$gender)
  data.list[[i]]$wealth <- as.numeric(as.character(data.list[[i]]$wealth))
  data.list[[i]]$occupation <- as.character(data.list[[i]]$occupation)
  data.list[[i]]$education <- as.numeric(as.character(data.list[[i]]$education))
  data.list[[i]]$income <- as.numeric(as.character(data.list[[i]]$income))
  data.list[[i]]$knowledge <- as.numeric(as.character(data.list[[i]]$knowledge))
}
# columns that only the elite files carry
for (i in elite) {
  data.list[[i]]$sex <- as.character(data.list[[i]]$sex)
  data.list[[i]]$party <- as.character(data.list[[i]]$party)
}
df.mass <- do.call(rbind, data.list[mass])
df.elite <- do.call(rbind, data.list[elite])
# for ease of coding later, code "worker" as "0", and "professional" as "4",
# comparable to other affluence quintiles ("other" sits in the middle, "2").
# %in% is FALSE for NA, so missing occupations stay missing
df.mass$occupation[df.mass$occupation %in% "worker"] <- "0"
df.mass$occupation[df.mass$occupation %in% "professional"] <- "4"
df.mass$occupation[df.mass$occupation %in% "other"] <- "2"
# clean up
df.mass <- df.mass[order(df.mass$country, df.mass$year), ]
df.elite <- df.elite[order(df.elite$country, df.elite$year), ]
write.csv(df.mass, "./final/mass-merged.csv", row.names = FALSE)
write.csv(df.elite, "./final/elite-merged.csv", row.names = FALSE)
rm(list = ls()[!(ls() %in% c("completeFun", "expand.grid.df",
                             "qcut", "rescalr", "simpleCap", "df.final"))])

##### DROP DUPLICATES IN MERGED ELITE DATA #####
mass <- read.csv("./final/mass-merged.csv")
elite <- read.csv("./final/elite-merged.csv")
# align column classes for proper merging/splitting: every column becomes
# character except ideology_scaled, which is numeric
is_scaled <- colnames(mass) == "ideology_scaled"
chars <- which(!is_scaled)
nums <- which(is_scaled)
mass[chars] <- lapply(mass[chars], as.character)
mass[, nums] <- as.numeric(mass[, nums])
# same treatment for the elite data
is_scaled <- colnames(elite) == "ideology_scaled"
chars <- which(!is_scaled)
nums <- which(is_scaled)
elite[chars] <- lapply(elite[chars], as.character)
elite[, nums] <- as.numeric(elite[, nums])
### Resolve duplicate sampling
elite.full <- elite
# BRS 1997 sampling done in 1997 using incumbents, BCS done in 1992
elite <- elite[-which(elite$survey == "BRS 1997" & elite$year == "1992"),]
# PELA study 05 done in May-June 1996, versus PELA study 06 done in Aug-Sep 1998
elite <- elite[-which(elite$survey == "PELA study 06" & elite$year == "1997"),]
# PELA study 09 done in 1998 vs PELA study 10 done in 1996
elite <- elite[-which(elite$survey == "PELA study 09" & elite$year == "1997"),]
# PELA study 03 done in 1998 vs PELA study 04 done in 1994
elite <- elite[-which(elite$survey == "PELA study 04" & elite$year == "1997"),]
# PELA study 07 done in 1998 vs PELA study 08 done in 1994
elite <- elite[-which(elite$survey == "PELA study 08" & elite$year == "1997"),]
# PELA study 11 done in 1998 vs PELA study 12 done in 1994
elite <- elite[-which(elite$survey == "PELA study 12" & elite$year == "1997"),]
# PELA study 01 done in 1998 vs PELA study 02 done in 1995
elite <- elite[-which(elite$survey == "PELA study 02" & elite$year == "1997"),]
# BRS 2001 sampling done in 2001 using incumbents vs BRS 1997 done in 1997
elite <- elite[-which(elite$survey == "BRS 2001" & elite$year == "1997"),]
# PELA study 15 done in 1998 vs PELA study 16 done in 1994
elite <- elite[-which(elite$survey == "PELA study 16" & elite$year == "1998"),]
# PELA study 29 done in 2000 vs PELA study 30 done in 1995
elite <- elite[-which(elite$survey == "PELA study 30" & elite$year == "1998"),]
# PELA study 23 done in 1998 vs PELA study 24 done in 1996
elite <- elite[-which(elite$survey == "PELA study 24" & elite$year == "1998"),]
# PELA study 21 done in 1998 vs PELA study 22 done in 1994
elite <- elite[-which(elite$survey == "PELA study 22" & elite$year == "1998"),]
# PELA study 27 done in 2000 vs PELA study 07 done in 1998
elite <- elite[-which(elite$survey == "PELA study 07" & elite$year == "2000"),]
# PELA study 01 done in 1998 vs PELA study 37 done in 2001
elite <- elite[-which(elite$survey == "PELA study 01" & elite$year == "2000"),]
# PELA study 33 done in 2001 vs PELA study 34 done in 1996
elite <- elite[-which(elite$survey == "PELA study 34" & elite$year == "2000"),]
# PELA study 09 done in 1998 vs PELA study 47 done in 2003
elite <- elite[-which(elite$survey == "PELA study 09" & elite$year == "2002"),]
# PELA study 13 done in 1998 vs PELA study 46 done in 2003
elite <- elite[-which(elite$survey == "PELA study 13" & elite$year == "2002"),]
# PELA study 15 done in 1998 vs PELA study 43 done in 2002
elite <- elite[-which(elite$survey == "PELA study 15" & elite$year == "2002"),]
# PELA study 29 done in 2000 vs PELA study 44 done in 2003
elite <- elite[-which(elite$survey == "PELA study 29" & elite$year == "2002"),]
# PELA study 23 done in 1998 vs PELA study 45 done in 2003
elite <- elite[-which(elite$survey == "PELA study 23" & elite$year == "2002"),]
# BLS sampling done in 2001 vs PELA study 55 done in 2005, BLS less comparable
elite <- elite[-which(elite$survey == "Brazilian Legislator Survey v1" &
                        elite$year == "2003"),]
# PELA study 27 done in 2000 vs PELA study 48 done in 2003
elite <- elite[-which(elite$survey == "PELA study 27" & elite$year == "2003"),]
# PELA study 37 done in 2001 vs PELA study 50 done in 2003
elite <- elite[-which(elite$survey == "PELA study 37" & elite$year == "2003"),]
# PELA study 21 done in 1998 vs PELA study 49 done in 2003
elite <- elite[-which(elite$survey == "PELA study 21" & elite$year == "2003"),]
# BLS sampling done in 2001 vs PELA study 55 done in 2005 + BLS less comparable
elite <- elite[-which(elite$survey == "Brazilian Legislator Survey v1" &
                        elite$year == "2004"),]
# PELA study 38 done in 2002 vs PELA study 52 done in 2004
elite <- elite[-which(elite$survey == "PELA study 38" & elite$year == "2004"),]
# ATES CAN 2003 done in 2003 vs ATES HOC 2004 done in 2004, but CAN broader
elite <- elite[-which(elite$survey == "ATES HOC 2004" & elite$year == "2004"),]
# PELA study 41 done in 2002 vs PELA study 53 done in 2004
elite <- elite[-which(elite$survey == "PELA study 41" & elite$year == "2004"),]
# BLS sampling done in 2005 vs PELA study 55 done in 2005, BLS less comparable
elite <- elite[-which(elite$survey == "Brazilian Legislator Survey v1" &
                        elite$year == "2005"),]
# ATES CAN 2003 done in 2003, ATES HOC 2004 done in 2004, ATES CAN 2005 in 2005
elite <- elite[-which(elite$survey == "ATES CAN 2003" & elite$year == "2005"),]
elite <- elite[-which(elite$survey == "ATES HOC 2004" & elite$year == "2005"),]
# PELA study 33 done in 2001 vs PELA study 54 done in 2005
elite <- elite[-which(elite$survey == "PELA study 33" & elite$year == "2005"),]
# BLS sampling done in 2005 vs PELA study 75 done in 2007 + BLS less comparable
elite <- elite[-which(elite$survey == "Brazilian Legislator Survey v1" &
                        elite$year == "2006"),]
elite <- elite[-which(elite$survey == "PELA study 55" & elite$year == "2006"),]
# PELA study 42 done in 2002 vs PELA study 60 done in 2006
elite <- elite[-which(elite$survey == "PELA study 42" & elite$year == "2006"),]
# PELA study 46 done in 2003 vs PELA study 59 done in 2006
elite <- elite[-which(elite$survey == "PELA study 46" & elite$year == "2006"),]
# PELA study 43 done in 2002 vs PELA study 56 done in 2006
elite <- elite[-which(elite$survey == "PELA study 43" & elite$year == "2006"),]
# PELA study 44 done in 2003 vs PELA study 64 done in 2006
elite <- elite[-which(elite$survey == "PELA study 44" & elite$year == "2006"),]
# PELA study 48 done in 2003 vs PELA study 58 done in 2006
elite <- elite[-which(elite$survey == "PELA study 48" & elite$year == "2006"),]
# PELA study 40 done in 2002 vs PELA study 57 done in 2006
elite <- elite[-which(elite$survey == "PELA study 40" & elite$year == "2006"),]
# ATES CAN 2005 done in 2005 vs ATES HOC 2004 done in 2004, and CAN broader
elite <- elite[-which(elite$survey == "ATES HOC 2004" & elite$year == "2006"),]
# PELA study 50 done in 2003 vs PELA study 63 done in 2006
elite <- elite[-which(elite$survey == "PELA study 50" & elite$year == "2006"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2006" &
                        elite$country == "Netherlands"),]
# Pela study 31 done in 2001, 61 done in 2006, 80 done in 2010
elite <- elite[-which(elite$survey == "PELA study 31" & elite$year == "2006"),]
elite <- elite[-which(elite$survey == "PELA study 80" & elite$year == "2006"),]
# Pela study 51 done in 2004, 67 done in 2008, 73 done in 2010
elite <- elite[-which(elite$survey == "PELA study 51" & elite$year == "2007"),]
elite <- elite[-which(elite$survey == "PELA study 73" & elite$year == "2007"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2007" &
                        elite$country == "Belgium"),]
# BLS sampling done in 2005 vs PELA study 55 done in 2010, BLS less comparable
elite <- elite[-which(elite$survey == "Brazilian Legislator Survey v1" &
                        elite$year == "2007"),]
# Pela study 55 done in 2005
elite <- elite[-which(elite$survey == "PELA study 55" & elite$year == "2007"),]
# ATES CAN 2005 done in 2005 vs ATES HOC 2004 done in 2004, and CAN broader
elite <- elite[-which(elite$survey == "ATES HOC 2004" & elite$year == "2007"),]
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2007" &
                        elite$country == "Netherlands"),]
# PELA study 61 done in 2006 vs PELA study 80 done in 2010
elite <- elite[-which(elite$survey == "PELA study 80" & elite$year == "2007"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2007" &
                        elite$country == "Switzerland"),]
# PELA study 67 done in 2008 vs PELA study 73 done in 2010
elite <- elite[-which(elite$survey == "PELA study 73" & elite$year == "2008"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2008" &
                        elite$country == "Austria"),]
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2008" &
                        elite$country == "Belgium"),]
# BLS sampling done in 2005 vs PELA study 75 done in 2010 + BLS less comparable
elite <- elite[-which(elite$survey == "Brazilian Legislator Survey v1" &
                        elite$year == "2008"),]
# PELA study 52 done in 2004 vs PELA study 68 done in 2008
elite <- elite[-which(elite$survey == "PELA study 52" & elite$year == "2008"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2008" &
                        elite$country == "Netherlands"),]
# PELA study 49 done in 2003 vs PELA study 69 done in 2008
elite <- elite[-which(elite$survey == "PELA study 49" & elite$year == "2008"),]
# PELA study 61 done in 2006 vs PELA study 80 done in 2010
elite <- elite[-which(elite$survey == "PELA study 61" & elite$year == "2008"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2008" &
                        elite$country == "Switzerland"),]
# PELA study 67 done in 2008 vs PELA study 73 done in 2010
elite <- elite[-which(elite$survey == "PELA study 67" & elite$year == "2009"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2009" &
                        elite$country == "Austria"),]
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2009" &
                        elite$country == "Belgium"),]
# BLS sampling done in 2005 vs PELA study 75 done in 2010 + BLS less comparable
elite <- elite[-which(elite$survey == "Brazilian Legislator Survey v1" &
                        elite$year == "2009"),]
# PELA study 58 done in 2006 vs PELA study 70 done in 2009
elite <- elite[-which(elite$survey == "PELA study 58" & elite$year == "2009"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2009" &
                        elite$country == "Germany"),]
# PELA study 63 done in 2006 vs PELA study 79 done in 2010
elite <- elite[-which(elite$survey == "PELA study 63" & elite$year == "2009"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2009" &
                        elite$country == "Netherlands"),]
# PELA study 53 done in 2004 vs PELA study 71 done in 2009
elite <- elite[-which(elite$survey == "PELA study 53" & elite$year == "2009"),]
# PELA study 61 done in 2006 vs PELA study 80 done in 2010
elite <- elite[-which(elite$survey == "PELA study 61" & elite$year == "2009"),]
# PARENEL narrower than PartiRep, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PARENEL 2008" & elite$year == "2009" &
                        elite$country == "Portugal"),]
elite <- elite[-which(elite$survey == "PARENEL 2012" & elite$year == "2009" &
                        elite$country == "Portugal"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2009" &
                        elite$country == "Switzerland"),]
# PELA study 67 done in 2008 vs PELA study 73 done in 2010
elite <- elite[-which(elite$survey == "PELA study 67" & elite$year == "2010"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2010" &
                        elite$country == "Austria"),]
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2010" &
                        elite$country == "Belgium"),]
# BLS sampling done in 2009 vs PELA study 75 done in 2010 + BLS less comparable
elite <- elite[-which(elite$survey == "Brazilian Legislator Survey v1" &
                        elite$year == "2010"),]
# PELA study 60 done in 2006 vs PELA study 77 done in 2010
elite <- elite[-which(elite$survey == "PELA study 60" & elite$year == "2010"),]
# PELA study 59 done in 2006 vs PELA study 83 done in 2010/11
elite <- elite[-which(elite$survey == "PELA study 59" & elite$year == "2010"),]
# PELA study 56 done in 2006 vs PELA study 78 done in 2010
elite <- elite[-which(elite$survey == "PELA study 56" & elite$year == "2010"),]
# PELA study 64 done in 2006 vs PELA study 82 done in 2011
elite <- elite[-which(elite$survey == "PELA study 64" & elite$year == "2010"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2010" &
                        elite$country == "Germany"),]
# PELA study 57 done in 2006 vs PELA study 74 done in 2010
elite <- elite[-which(elite$survey == "PELA study 57" & elite$year == "2010"),]
# CCS has broader coverage than PartiRep and HES
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2010" &
                        elite$country == "Hungary"),]
elite <- elite[-which(elite$survey == "Hungarian Election Study" &
                        elite$year == "2010" & elite$country == "Hungary"),]
# PartiRep has the right parties
elite <- elite[-which(elite$survey == "Comparative Candidates Survey" &
                        elite$year == "2010" &
                        elite$country == "Netherlands"),]
# PELA study 61 done in 2006 vs PELA study 80 done in 2010
elite <- elite[-which(elite$survey == "PELA study 61" & elite$year == "2010"),]
# PARENEL narrower than PartiRep, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PARENEL 2012" & elite$year == "2010" &
                        elite$country == "Portugal"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2010" &
                        elite$country == "Switzerland"),]
# PELA study 54 done in 2005 vs PELA study 76 done in 2010
elite <- elite[-which(elite$survey == "PELA study 54" & elite$year == "2010"),]
# PELA study 67 done in 2008 vs PELA study 73 done in 2010
elite <- elite[-which(elite$survey == "PELA study 67" & elite$year == "2011"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2011" &
                        elite$country == "Austria"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2011" &
                        elite$country == "Germany"),]
# CCS has broader coverage than HES
elite <- elite[-which(elite$survey == "Hungarian Election Study" &
                        elite$year == "2011" & elite$country == "Hungary"),]
# PELA study 61 done in 2006 vs PELA study 80 done in 2010 vs 84 in 2011
elite <- elite[-which(elite$survey == "PELA study 61" & elite$year == "2011"),]
elite <- elite[-which(elite$survey == "PELA study 80" & elite$year == "2011"),]
# PARENEL narrower than CCS, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PARENEL 2012" & elite$year == "2011" &
                        elite$country == "Portugal"),]
# PartiRep narrower than CCS, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2011" &
                        elite$country == "Portugal"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2012" &
                        elite$country == "Austria"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2012" &
                        elite$country == "Germany"),]
# PELA study 68 done in 2008 vs PELA study 85 done in 2012
elite <- elite[-which(elite$survey == "PELA study 68" & elite$year == "2012"),]
# CCS has broader coverage than HES
elite <- elite[-which(elite$survey == "Hungarian Election Study" &
                        elite$year == "2012" & elite$country == "Hungary"),]
# PARENEL narrower than PartiRep, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2012" &
                        elite$country == "Portugal"),]
# PARENEL narrower than CCS, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PARENEL 2012" & elite$year == "2012" &
                        elite$country == "Portugal"),]
# PELA study 73 not matched to mass sample like Joignant et al
elite <- elite[-which(elite$survey == "PELA study 73" & elite$year == "2013"),]
# PartiRep narrower than CCS, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2013" &
                        elite$country == "Austria"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2013" &
                        elite$country == "Germany"),]
# CCS has broader coverage than HES
elite <- elite[-which(elite$survey == "Hungarian Election Study" &
                        elite$year == "2013" & elite$country == "Hungary"),]
# PartiRep narrower than CCS, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2013" &
                        elite$country == "Italy"),]
# PELA study 69 done in 2008 vs PELA study 91 done in 2013
elite <- elite[-which(elite$survey == "PELA study 69" & elite$year == "2013"),]
# PARENEL narrower than PartiRep, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2013" &
                        elite$country == "Portugal"),]
# PARENEL narrower than CCS, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PARENEL 2012" & elite$year == "2013" &
                        elite$country == "Portugal"),]
# PELA study 83 done in 2010/11 vs PELA study 95 done in 2014
elite <- elite[-which(elite$survey == "PELA study 83" & elite$year == "2014"),]
# PELA study 78 done in 2010 vs PELA study 93 done in 2014
elite <- elite[-which(elite$survey == "PELA study 78" & elite$year == "2014"),]
# PELA study 74 done in 2010/11 vs PELA study 92 done in 2014
elite <- elite[-which(elite$survey == "PELA study 74" & elite$year == "2014"),]
# CCS has broader coverage than HES
elite <- elite[-which(elite$survey == "Hungarian Election Study" &
                        elite$year == "2014" & elite$country == "Hungary"),]
# PARENEL narrower than CCS, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PARENEL 2012" & elite$year == "2014" &
                        elite$country == "Portugal"),]
# PARENEL narrower than CCS, ignore MEPs v MPs
elite <- elite[-which(elite$survey == "PARENEL 2012" & elite$year == "2015" &
                        elite$country == "Portugal"),]
# CCS has broader coverage than PartiRep -- twice the N in our full data
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2010" &
                        elite$country == "United Kingdom"),]
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2011" &
                        elite$country == "United Kingdom"),]
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2012" &
                        elite$country == "United Kingdom"),]
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2013" &
                        elite$country == "United Kingdom"),]
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2014" &
                        elite$country == "United Kingdom"),]
elite <- elite[-which(elite$survey == "PartiRep" & elite$year == "2015" &
                        elite$country == "United Kingdom"),]
# EPRG MEP wave 2 in 2006 vs EPRG MEP wave 1 in 2000
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Austria"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Belgium"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Denmark"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Finland"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "France"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Germany"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Greece"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Ireland"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Italy"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Luxembourg"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Netherlands"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Portugal"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Spain"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "Sweden"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 1" &
                        elite$year == "2004" & elite$country == "United Kingdom"),]
# EPRG MEP wave 3 in 2010 vs EPRG MEP wave 2 in 2006
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 2" &
                        elite$year == "2009" & elite$country == "Austria"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 2" &
                        elite$year == "2009" & elite$country == "Belgium"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 2" &
                        elite$year == "2009" & elite$country == "Cyprus"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 2" &
                        elite$year == "2009" & elite$country == "Czech Republic"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 2" &
                        elite$year == "2009" & elite$country == "Denmark"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 2" &
                        elite$year == "2009" & elite$country == "Estonia"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 2" &
                        elite$year == "2009" & elite$country == "Finland"),]
elite <- elite[-which(elite$survey == "EPRG MEP Survey Wave 2" &
                        elite$year == "2009" & elite$country == "France"),]
# Drop EPRG MEP Survey Wave 2 (2009) rows for the countries listed below, plus
# Wave 3 (2009) for Luxembourg.
#
# Rows are removed with a logical mask rather than `elite[-which(cond), ]`:
# when `cond` matches nothing, `-which(cond)` is `-integer(0)`, which selects
# zero rows and would silently empty `elite`.
wave2_2009_countries <- c("Germany", "Greece", "Hungary", "Ireland", "Italy",
                          "Latvia", "Lithuania", "Malta", "Netherlands",
                          "Poland", "Portugal", "Slovak Republic", "Slovenia",
                          "Spain", "Sweden", "United Kingdom")
drop_wave2_2009 <- elite$survey == "EPRG MEP Survey Wave 2" &
  elite$year == "2009" &
  elite$country %in% wave2_2009_countries
# delete wave 3 for Lux because all NA
drop_wave3_lux <- elite$survey == "EPRG MEP Survey Wave 3" &
  elite$year == "2009" &
  elite$country == "Luxembourg"
# `%in% TRUE` maps NA comparisons to "keep the row", exactly as which() did
elite <- elite[!((drop_wave2_2009 | drop_wave3_lux) %in% TRUE), ]
# EPRG MEP wave 4 in 2015 vs EPRG MEP wave 3 in 2010
# Drop the EPRG MEP Survey Wave 3 rows recorded for 2014 in the countries
# listed below.
#
# A logical mask is used instead of `elite[-which(cond), ]`, because
# `-which(cond)` with no matches is `-integer(0)` and would drop every row.
wave3_2014_countries <- c("Austria", "Belgium", "Bulgaria", "Cyprus",
                          "Czech Republic", "Denmark", "Estonia", "Finland",
                          "France", "Germany", "Greece", "Hungary", "Ireland",
                          "Italy", "Latvia", "Lithuania", "Netherlands",
                          "Poland", "Portugal", "Romania", "Slovenia", "Spain",
                          "United Kingdom")
drop_wave3_2014 <- elite$survey == "EPRG MEP Survey Wave 3" &
  elite$year == "2014" &
  elite$country %in% wave3_2014_countries
# `%in% TRUE` maps NA comparisons to "keep the row", exactly as which() did
elite <- elite[!(drop_wave3_2014 %in% TRUE), ]
# Sweden RDU surveys conducted at beginning of term so drop lame ducks
# Each RDU wave is dropped only for the single year paired with it below.
rdu_wave <- c("Sweden RDU wave 1", "Sweden RDU wave 4", "Sweden RDU wave 5",
              "Sweden RDU wave 6", "Sweden RDU wave 7", "Sweden RDU wave 8",
              "Sweden RDU wave 9")
rdu_year <- c("1988", "1996", "1998", "2002", "2006", "2010", "2014")
# look up the year to drop for each row's survey (NA for non-RDU surveys)
drop_lame_duck <- elite$year == rdu_year[match(elite$survey, rdu_wave)]
# also drop every EPRG MEP Survey Wave 3 row recorded for 2014, in any country
drop_wave3_2014 <- elite$survey == "EPRG MEP Survey Wave 3" &
  elite$year == "2014"
# Logical mask instead of `-which(cond)`: with no matches `-which(cond)` is
# `-integer(0)` and would drop every row. `%in% TRUE` keeps rows whose
# comparison is NA, as which() did.
elite <- elite[!((drop_lame_duck | drop_wave3_2014) %in% TRUE), ]

### Create "no MEP" sample
# Copy of `elite` without any CIRCAP survey, the four EPRG MEP survey waves,
# or the Flash Eurobarometer 1996.
#
# One logical mask replaces the chain of `x[-which(cond), ]` statements; that
# form drops every row whenever a survey has no rows left to match, because
# `-integer(0)` selects nothing.
mep_surveys <- c("EPRG MEP Survey Wave 1", "EPRG MEP Survey Wave 2",
                 "EPRG MEP Survey Wave 3", "EPRG MEP Survey Wave 4",
                 "Flash Eurobarometer 1996")
# grepl() and %in% both return FALSE (never NA) for a missing survey name, so
# rows with NA in `survey` are kept, as before
is_mep_survey <- grepl("CIRCAP", elite$survey) | elite$survey %in% mep_surveys
elite.nomeps <- elite[!is_mep_survey, ]

##### START CALCULATING EMD #####
### run the loop on "all" data
# Result columns for the full elite sample; every one starts out as all-NA and
# is filled in row by row by the EMD loop.
full_result_cols <- c(
  "mscale_full", "escale_full", "esurvey_full",
  "emd_all_full", "emd_lessaffluent_full", "emd_midloaffluent_full",
  "emd_midaffluent_full", "emd_midhiaffluent_full", "emd_moreaffluent_full",
  "nobs_elite_full", "emean_full", "evar_full",
  "emd_lessaffluent_hiinfo_full", "emd_lessaffluent_loinfo_full",
  "emd_midloaffluent_hiinfo_full", "emd_midloaffluent_loinfo_full",
  "emd_midaffluent_hiinfo_full", "emd_midaffluent_loinfo_full",
  "emd_midhiaffluent_hiinfo_full", "emd_midhiaffluent_loinfo_full",
  "emd_moreaffluent_hiinfo_full", "emd_moreaffluent_loinfo_full"
)
# walk the names back to front so that new columns are appended to df.final in
# the same order a right-to-left chained assignment would create them
for (nm in rev(full_result_cols)) {
  df.final[[nm]] <- rep(NA, nrow(df.final))
}
# set up temp storage: one slot per row of df.final, preallocated so the lists
# are not regrown on every iteration
n_cells <- nrow(df.final)
mlist <- vector("list", n_cells)
elist <- vector("list", n_cells)
# break out the data -- takes a while
# For each country-year row, keep the matching mass respondents (mlist) and
# full-sample elite respondents (elist), and record which surveys they came
# from.
pb <- txtProgressBar(min = 1, max = n_cells, style = 3)
for (i in seq_len(n_cells)) {
  mass_rows <- mass[which(mass$country == df.final$country[i] &
                            mass$year == df.final$year[i]), ]
  elite_rows <- elite.full[which(elite.full$country == df.final$country[i] &
                                   elite.full$year == df.final$year[i]), ]
  mlist[[i]] <- mass_rows
  elist[[i]] <- elite_rows
  if (nrow(mass_rows) > 0) {
    df.final$msurvey[i] <- paste0(unique(mass_rows$survey), collapse = ", ")
  }
  if (nrow(elite_rows) > 0) {
    df.final$esurvey_full[i] <- paste0(unique(elite_rows$survey),
                                       collapse = ", ")
  }
  setTxtProgressBar(pb, i)
}
close(pb)
# run it
#
# For every country-year row of df.final that has both elite and mass ideology
# data, compute the earth mover's distance (emdw) between the elite ideology
# distribution and the mass distribution -- overall, by affluence group, and
# by affluence group x political information -- and store the results, the
# elite mean/variance, and some meta-data in the *_full columns.

# Return the name of the first measure (in the order given) that has at least
# one non-missing value, or NA_character_ if every measure is entirely missing
# (or absent, i.e. NULL).
first_observed_measure <- function(measures) {
  for (nm in names(measures)) {
    if (!all(is.na(measures[[nm]]))) {
      return(nm)
    }
  }
  NA_character_
}

# Build the two inputs emdw() takes for one group: the non-missing values as a
# one-column matrix (x) and equal weights of 1/n each as a one-column matrix
# (w). An empty group yields two zero-row matrices.
uniform_emd_sample <- function(values) {
  kept <- na.omit(values)
  n <- length(kept)
  list(x = as.matrix(kept), w = as.matrix(rep(1 / n, n)))
}

# codes of the affluence groups in the `class` column and of the two
# information levels in the `know` column
affluence_codes <- c(lessaffluent = "0", midloaffluent = "1",
                     midaffluent = "2", midhiaffluent = "3",
                     moreaffluent = "4")
info_codes <- c(hiinfo = "4", loinfo = "0")

pb <- txtProgressBar(min = 1, max = nrow(df.final), style = 3)
for (i in seq_len(nrow(df.final))) {
  # advance the bar first so that skipped rows are counted too
  setTxtProgressBar(pb, i)
  mdata <- mlist[[i]]
  edata <- elist[[i]]

  # nothing to compare unless both sides have some ideology data
  if (all(is.na(edata$ideology)) || all(is.na(mdata$ideology))) {
    next
  }

  # pick the best available class measure, ranked by our preference, and
  # expose it as `class`
  whichclass <- first_observed_measure(list(
    wealth = mdata$wealth, income = mdata$income,
    occupation = mdata$occupation, education = mdata$education
  ))
  if (!is.na(whichclass)) {
    colnames(mdata)[grep(whichclass, names(mdata))] <- "class"
  }

  # pick the best available knowledge measure and expose it as `know`; this
  # runs after the rename above, so if education was used for class it is no
  # longer available here
  whichinfo <- first_observed_measure(list(
    knowledge = mdata$knowledge, education = mdata$education
  ))
  if (!is.na(whichinfo)) {
    colnames(mdata)[grep(whichinfo, names(mdata))] <- "know"
  }

  ### Elite
  elite_sample <- uniform_emd_sample(edata$ideology_scaled)
  y <- elite_sample$x
  w.y <- elite_sample$w

  ### All mass
  mass_sample <- uniform_emd_sample(mdata$ideology_scaled)

  ### Mass by affluence group
  by_class <- lapply(affluence_codes, function(code) {
    uniform_emd_sample(mdata$ideology_scaled[which(mdata$class == code)])
  })

  ### Mass by affluence group and information level (hi, then lo, per group)
  by_class_info <- list()
  for (grp in names(affluence_codes)) {
    for (lvl in names(info_codes)) {
      pick <- which(mdata$class == affluence_codes[[grp]] &
                      mdata$know == info_codes[[lvl]])
      by_class_info[[paste(grp, lvl, sep = "_")]] <-
        uniform_emd_sample(mdata$ideology_scaled[pick])
    }
  }

  # ensure everything is going correctly: the two extreme affluence groups
  # and the elite sample must be non-empty
  if (length(by_class[["lessaffluent"]]$x) == 0 ||
      length(by_class[["moreaffluent"]]$x) == 0 ||
      length(y) == 0) {
    warning("check the if-length condition!")
    next
  }

  # some defensive programming checks: weights must sum to one
  stopifnot(isTRUE(all.equal(sum(mass_sample$w), 1)))
  stopifnot(isTRUE(all.equal(sum(w.y), 1)))
  for (grp_sample in c(by_class, by_class_info)) {
    if (length(grp_sample$x) > 0) {
      stopifnot(isTRUE(all.equal(sum(grp_sample$w), 1)))
    }
  }

  # rows of df.final that belong to this country-year
  rows <- which(df.final$country == unique(mdata$country) &
                  df.final$year == unique(mdata$year))

  # EMD for all
  df.final$emd_all_full[rows] <-
    emdw(mass_sample$x, mass_sample$w, y, w.y, max.iter = 100000)

  # EMD by affluence
  # NOTE(review): only the two extreme groups are checked for emptiness
  # above, so emdw() can still receive an empty middle or hi/lo-info group,
  # as in the original code -- confirm how emdw() treats empty input.
  for (grp in names(by_class)) {
    df.final[[paste0("emd_", grp, "_full")]][rows] <-
      emdw(by_class[[grp]]$x, by_class[[grp]]$w, y, w.y, max.iter = 100000)
  }

  # EMD by affluence and political knowledge
  for (cell in names(by_class_info)) {
    df.final[[paste0("emd_", cell, "_full")]][rows] <-
      emdw(by_class_info[[cell]]$x, by_class_info[[cell]]$w, y, w.y,
           max.iter = 100000)
  }

  ### store mean and variance
  df.final$evar_full[rows] <- sd(y)^2
  df.final$emean_full[rows] <- mean(y)

  ### Store meta-data
  df.final$nobs_elite_full[rows] <- length(y)
  df.final$mscale_full[rows] <-
    paste(unique(mdata$ideology_scale_orig), collapse = ", ")
  df.final$escale_full[rows] <-
    paste(unique(edata$ideology_scale_orig), collapse = ", ")
}
close(pb)

### run the loop on the unweighted sample
# Result columns for this run; every one starts out as all-NA and is filled in
# row by row by the loop that follows.
unweighted_result_cols <- c(
  "escale", "mscale", "evar", "mvar", "emean", "mmean", "emd_all",
  "mvar_poor", "mvar_midlo", "mvar_mid", "mvar_midhi", "mvar_rich",
  "mmean_poor", "mmean_midlo", "mmean_mid", "mmean_midhi", "mmean_rich",
  "nobs_mass_mid", "nobs_mass_midlo", "nobs_mass_midhi",
  "emd_lessaffluent_hiinfo", "emd_lessaffluent_loinfo",
  "emd_midloaffluent_hiinfo", "emd_midloaffluent_loinfo",
  "emd_midaffluent_hiinfo", "emd_midaffluent_loinfo",
  "emd_midhiaffluent_hiinfo", "emd_midhiaffluent_loinfo",
  "emd_moreaffluent_hiinfo", "emd_moreaffluent_loinfo",
  "mknow_var", "mclass_var",
  "emd_midloaffluent", "emd_midaffluent", "emd_midhiaffluent"
)
# walk the names back to front so that new columns are appended to df.final in
# the same order a right-to-left chained assignment would create them
for (nm in rev(unweighted_result_cols)) {
  df.final[[nm]] <- rep(NA, nrow(df.final))
}
# reset the elite storage: one preallocated slot per row of df.final
elist <- vector("list", nrow(df.final))
# break out the data
# For each country-year row, keep the matching respondents from `elite` and
# record which surveys they came from.
pb <- txtProgressBar(min = 1, max = nrow(df.final), style = 3)
for (i in seq_len(nrow(df.final))) {
  elite_rows <- elite[which(elite$country == df.final$country[i] &
                              elite$year == df.final$year[i]), ]
  elist[[i]] <- elite_rows
  if (nrow(elite_rows) > 0) {
    df.final$esurvey[i] <- paste0(unique(elite_rows$survey), collapse = ", ")
  }
  setTxtProgressBar(pb, i)
}
close(pb)
pb <- txtProgressBar(min = 1, max = nrow(df.final), style = 3)
for(i in 1:nrow(df.final)){
  mdata <- mlist[[i]]
  edata <- elist[[i]]
  
  if(nrow(edata[which(!is.na(edata$ideology)),]) == 0 |
     nrow(mdata[which(!is.na(mdata$ideology)),]) == 0){
    next
  } else {
    # create a temporary data frame of class measures, ranked by our preference
    class <- data.frame(cbind(mdata$country, mdata$wealth, mdata$income,
                              mdata$occupation))
    colnames(class) <- c("country","wealth","income","occupation")
    # delete all NAs
    for(j in ncol(class):2){
      if(all(is.na(unique(class[,j]))) == T){
        class <- class[,-j]
      }
    }
    # make sure the loop uses our best class measure
    whichclass <- colnames(class)[2]
    # store which class measure
    df.final$mclass_var[which(df.final$country == unique(mdata$country) &
                                df.final$year == unique(mdata$year))] <-
      whichclass
    colnames(mdata)[grep(whichclass, names(mdata))] <- "class"
    
    # create temporary data for knowledge -- country just so it stays a df
    info <- data.frame(cbind(mdata$country, mdata$knowledge, mdata$education))
    colnames(info) <- c("country","knowledge","education")
    for(j in ncol(info):2){
      if(all(is.na(unique(info[,j]))) == T){
        info <- info[,-j]
      }
    }
    # make sure the loop uses our best class measure
    whichinfo <- colnames(info)[2]
    # store which knowledge variable we use
    df.final$mknow_var[which(df.final$country == unique(mdata$country) &
                               df.final$year == unique(mdata$year))] <-
      whichinfo
    colnames(mdata)[grep(whichinfo, names(mdata))] <- "know"
    
    ### Elite
    y <- as.matrix(na.omit(edata$ideology_scaled))
    w.y <- as.matrix(rep(1/nrow(data.frame(na.omit(edata$ideology_scaled))),
                         nrow(data.frame(na.omit(edata$ideology_scaled)))))
    
    ### All mass
    x <- as.matrix(na.omit(mdata$ideology_scaled))
    w.x <- as.matrix(rep(1/nrow(data.frame(na.omit(mdata$ideology_scaled))),
                         nrow(data.frame(na.omit(mdata$ideology_scaled)))))
    
    ### Less affluent mass
    x.lessaffluent <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0")]))
    w.x.lessaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "0")])))))
    
    ### Midlo affluent mass
    x.midloaffluent <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1")]))
    w.x.midloaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "1")])))))
    
    ### Mid affluent mass
    x.midaffluent <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2")]))
    w.x.midaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "2")])))))
    
    ### Midhi affluent mass
    x.midhiaffluent <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3")]))
    w.x.midhiaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "3")])))))
    
    ### More affluent mass
    x.moreaffluent <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4")]))
    w.x.moreaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "4")])))))
    
    ### Less affluent mass, high info
    x.lessaffluent.hi <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0" & mdata$know == "4")]))
    w.x.lessaffluent.hi <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0" & mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "0" &
                                      mdata$know == "4")])))))
    
    ### Midlo affluent mass, high info
    x.midloaffluent.hi <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1" & mdata$know == "4")]))
    w.x.midloaffluent.hi <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1" & mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "1" &
                                      mdata$know == "4")])))))
    
    ### Mid affluent mass, high info
    x.midaffluent.hi <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2" & mdata$know == "4")]))
    w.x.midaffluent.hi <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2" & mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "2" &
                                      mdata$know == "4")])))))
    
    ### Midhi affluent mass, high info
    x.midhiaffluent.hi <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3" & mdata$know == "4")]))
    w.x.midhiaffluent.hi <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3" & mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "3" &
                                      mdata$know == "4")])))))
    
    ### More affluent mass, high info
    x.moreaffluent.hi <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4" & mdata$know == "4")]))
    w.x.moreaffluent.hi <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4" & mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "4" &
                                      mdata$know == "4")])))))
    
    ### Less affluent mass, low info
    x.lessaffluent.lo <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0" & mdata$know == "0")]))
    w.x.lessaffluent.lo <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0" & mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "0" &
                                      mdata$know == "0")])))))
    
    ### Midlo affluent mass, low info
    x.midloaffluent.lo <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1" & mdata$know == "0")]))
    w.x.midloaffluent.lo <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1" & mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "1" &
                                      mdata$know == "0")])))))
    
    ### Mid affluent mass, low info
    x.midaffluent.lo <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2" & mdata$know == "0")]))
    w.x.midaffluent.lo <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2" & mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "2" &
                                      mdata$know == "0")])))))
    
    ### Midhi affluent mass, low info
    x.midhiaffluent.lo <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3" & mdata$know == "0")]))
    w.x.midhiaffluent.lo <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3" & mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "3" &
                                      mdata$know == "0")])))))
    
    ### More affluent mass, low info
    x.moreaffluent.lo <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4" & mdata$know == "0")]))
    w.x.moreaffluent.lo <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4" & mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "4" &
                                      mdata$know == "0")])))))
    
    # ensure everything is going correctly
    if(length(x.lessaffluent) == 0 | length(x.moreaffluent) == 0 |
       length(y) == 0){
      warning("check the if-length condition!")
      next
    } else {
      
      # some defensive programming checks
      stopifnot(all.equal(sum(w.x), 1))
      stopifnot(all.equal(sum(w.y), 1))
      if(length(x.lessaffluent) > 0){
        stopifnot(all.equal(sum(w.x.lessaffluent), 1))
      }
      if(length(x.midloaffluent) > 0){
        stopifnot(all.equal(sum(w.x.midloaffluent), 1))
      }
      if(length(x.midaffluent) > 0){
        stopifnot(all.equal(sum(w.x.midaffluent), 1))
      }
      if(length(x.midhiaffluent) > 0){
        stopifnot(all.equal(sum(w.x.midhiaffluent), 1))
      }
      if(length(x.moreaffluent) > 0){
        stopifnot(all.equal(sum(w.x.moreaffluent), 1))
      }
      if(length(x.lessaffluent.lo) > 0){
        stopifnot(all.equal(sum(w.x.lessaffluent.lo), 1))
      }
      if(length(x.lessaffluent.hi) > 0){
        stopifnot(all.equal(sum(w.x.lessaffluent.hi), 1))
      }
      if(length(x.midloaffluent.lo) > 0){
        stopifnot(all.equal(sum(w.x.midloaffluent.lo), 1))
      }
      if(length(x.midhiaffluent.hi) > 0){
        stopifnot(all.equal(sum(w.x.midhiaffluent.hi), 1))
      }
      if(length(x.midaffluent.lo) > 0){
        stopifnot(all.equal(sum(w.x.midaffluent.lo), 1))
      }
      if(length(x.midaffluent.hi) > 0){
        stopifnot(all.equal(sum(w.x.midaffluent.hi), 1))
      }
      if(length(x.midhiaffluent.lo) > 0){
        stopifnot(all.equal(sum(w.x.midhiaffluent.lo), 1))
      }
      if(length(x.midhiaffluent.hi) > 0){
        stopifnot(all.equal(sum(w.x.midhiaffluent.hi), 1))
      }
      if(length(x.moreaffluent.lo) > 0){
        stopifnot(all.equal(sum(w.x.moreaffluent.lo), 1))
      }
      if(length(x.moreaffluent.hi) > 0){
        stopifnot(all.equal(sum(w.x.moreaffluent.hi), 1))
      }
      
      ### Compute EMDs
      # EMD for all
      df.final$emd_all[which(df.final$country == unique(mdata$country) &
                               df.final$year == unique(mdata$year))] <-
        emdw(x,w.x,y,w.y,max.iter = 100000)
      
      # EMD by affluence
      df.final$emd_lessaffluent[which(df.final$country ==
                                        unique(mdata$country) &
                                        df.final$year == unique(mdata$year))] <-
        emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
      
      df.final$emd_midloaffluent[which(df.final$country ==
                                         unique(mdata$country) &
                                         df.final$year == unique(mdata$year))] <-
        emdw(x.midloaffluent,w.x.midloaffluent,y,w.y,max.iter = 100000)
      
      df.final$emd_midaffluent[which(df.final$country ==
                                       unique(mdata$country) &
                                       df.final$year == unique(mdata$year))] <-
        emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
      
      df.final$emd_midhiaffluent[which(df.final$country ==
                                         unique(mdata$country) &
                                         df.final$year == unique(mdata$year))] <-
        emdw(x.midhiaffluent,w.x.midhiaffluent,y,w.y,max.iter = 100000)
      
      df.final$emd_moreaffluent[which(df.final$country ==
                                        unique(mdata$country) &
                                        df.final$year == unique(mdata$year))] <-
        emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
      
      # EMD, less affluent, by political knowledge
      df.final$emd_lessaffluent_hiinfo[which(df.final$country ==
                                               unique(mdata$country) &
                                               df.final$year == unique(mdata$year))] <-
        emdw(x.lessaffluent.hi,w.x.lessaffluent.hi,y,w.y,max.iter = 100000)
      
      df.final$emd_lessaffluent_loinfo[which(df.final$country ==
                                               unique(mdata$country) &
                                               df.final$year == unique(mdata$year))] <-
        emdw(x.lessaffluent.lo,w.x.lessaffluent.lo,y,w.y,max.iter = 100000)
      
      # EMD, midlo affluent, by political knowledge
      df.final$emd_midloaffluent_hiinfo[which(df.final$country ==
                                                unique(mdata$country) &
                                                df.final$year == unique(mdata$year))] <-
        emdw(x.midloaffluent.hi,w.x.midloaffluent.hi,y,w.y,max.iter = 100000)
      
      df.final$emd_midloaffluent_loinfo[which(df.final$country ==
                                                unique(mdata$country) &
                                                df.final$year == unique(mdata$year))] <-
        emdw(x.midloaffluent.lo,w.x.midloaffluent.lo,y,w.y,max.iter = 100000)
      
      # EMD, mid affluent, by political knowledge
      df.final$emd_midaffluent_hiinfo[which(df.final$country ==
                                              unique(mdata$country) &
                                              df.final$year == unique(mdata$year))] <-
        emdw(x.midaffluent.hi,w.x.midaffluent.hi,y,w.y,max.iter = 100000)
      
      df.final$emd_midaffluent_loinfo[which(df.final$country ==
                                              unique(mdata$country) &
                                              df.final$year == unique(mdata$year))] <-
        emdw(x.midaffluent.lo,w.x.midaffluent.lo,y,w.y,max.iter = 100000)
      
      # EMD, midhi affluent, by political knowledge
      df.final$emd_midhiaffluent_hiinfo[which(df.final$country ==
                                                unique(mdata$country) &
                                                df.final$year == unique(mdata$year))] <-
        emdw(x.midhiaffluent.hi,w.x.midhiaffluent.hi,y,w.y,max.iter = 100000)
      
      df.final$emd_midhiaffluent_loinfo[which(df.final$country ==
                                                unique(mdata$country) &
                                                df.final$year == unique(mdata$year))] <-
        emdw(x.midhiaffluent.lo,w.x.midhiaffluent.lo,y,w.y,max.iter = 100000)
      
      # EMD, more affluent, by political knowledge
      df.final$emd_moreaffluent_hiinfo[which(df.final$country ==
                                               unique(mdata$country) &
                                               df.final$year == unique(mdata$year))] <-
        emdw(x.moreaffluent.hi,w.x.moreaffluent.hi,y,w.y,max.iter = 100000)
      
      df.final$emd_moreaffluent_loinfo[which(df.final$country ==
                                               unique(mdata$country) &
                                               df.final$year == unique(mdata$year))] <-
        emdw(x.moreaffluent.lo,w.x.moreaffluent.lo,y,w.y,max.iter = 100000)
      
      ### store means, variances, etc
      df.final$evar[which(df.final$country == unique(mdata$country) &
                            df.final$year == unique(mdata$year))] <- sd(y)^2
      df.final$mvar[which(df.final$country == unique(mdata$country) &
                            df.final$year == unique(mdata$year))] <- sd(x)^2
      df.final$emean[which(df.final$country == unique(mdata$country) &
                             df.final$year == unique(mdata$year))] <- mean(y)
      df.final$mmean[which(df.final$country == unique(mdata$country) &
                             df.final$year == unique(mdata$year))] <- mean(x)
      df.final$mvar_poor[which(df.final$country == unique(mdata$country) &
                                 df.final$year == unique(mdata$year))] <-
        sd(x.lessaffluent)^2
      df.final$mvar_midlo[which(df.final$country == unique(mdata$country) &
                                  df.final$year == unique(mdata$year))] <-
        sd(x.midloaffluent)^2
      df.final$mvar_mid[which(df.final$country == unique(mdata$country) &
                                df.final$year == unique(mdata$year))] <-
        sd(x.midaffluent)^2
      df.final$mvar_midhi[which(df.final$country == unique(mdata$country) &
                                  df.final$year == unique(mdata$year))] <-
        sd(x.midhiaffluent)^2
      df.final$mvar_rich[which(df.final$country == unique(mdata$country) &
                                 df.final$year == unique(mdata$year))] <-
        sd(x.moreaffluent)^2
      df.final$mmean_poor[which(df.final$country == unique(mdata$country) &
                                  df.final$year == unique(mdata$year))] <-
        mean(x.lessaffluent)
      df.final$mmean_midlo[which(df.final$country == unique(mdata$country) &
                                   df.final$year == unique(mdata$year))] <-
        mean(x.midloaffluent)
      df.final$mmean_mid[which(df.final$country == unique(mdata$country) &
                                 df.final$year == unique(mdata$year))] <-
        mean(x.midaffluent)
      df.final$mmean_midhi[which(df.final$country == unique(mdata$country) &
                                   df.final$year == unique(mdata$year))] <-
        mean(x.midhiaffluent)
      df.final$mmean_rich[which(df.final$country == unique(mdata$country) &
                                  df.final$year == unique(mdata$year))] <-
        mean(x.moreaffluent)
      
      ### Store meta-data
      df.final$nobs_mass_all[which(df.final$country == unique(mdata$country) &
                                     df.final$year == unique(mdata$year))] <-
        length(x)
      df.final$nobs_mass_poor[which(df.final$country == unique(mdata$country) &
                                      df.final$year == unique(mdata$year))] <-
        length(x.lessaffluent)
      df.final$nobs_mass_midlo[which(df.final$country == unique(mdata$country) &
                                       df.final$year == unique(mdata$year))] <-
        length(x.midloaffluent)
      df.final$nobs_mass_mid[which(df.final$country == unique(mdata$country) &
                                     df.final$year == unique(mdata$year))] <-
        length(x.midaffluent)
      df.final$nobs_mass_midhi[which(df.final$country == unique(mdata$country) &
                                       df.final$year == unique(mdata$year))] <-
        length(x.midhiaffluent)
      df.final$nobs_mass_rich[which(df.final$country == unique(mdata$country) &
                                      df.final$year == unique(mdata$year))] <-
        length(x.moreaffluent)
      df.final$nobs_elite[which(df.final$country == unique(mdata$country) &
                                  df.final$year == unique(mdata$year))] <-
        length(y)
      df.final$mscale[which(df.final$country == unique(mdata$country) &
                              df.final$year == unique(mdata$year))] <-
        paste(unique(mdata$ideology_scale_orig),collapse=", ")
      df.final$escale[which(df.final$country == unique(mdata$country) &
                              df.final$year == unique(mdata$year))] <-
        paste(unique(edata$ideology_scale_orig),collapse=", ")
    }
  }
  setTxtProgressBar(pb, i)
}

### run the loop on "no MEP" data
# Placeholder (all-NA) columns for every "no MEP" output. They are created
# last-to-first so that new columns are appended in the same order a single
# chained assignment over these names would produce.
nomep.cols <- c(
  "mscale_nomep", "escale_nomep", "esurvey_nomep",
  "emd_all_nomep", "emd_lessaffluent_nomep",
  "emd_midloaffluent_nomep", "emd_midaffluent_nomep",
  "emd_midhiaffluent_nomep", "emd_moreaffluent_nomep",
  "nobs_elite_nomep", "emean_nomep", "evar_nomep",
  "emd_lessaffluent_hiinfo_nomep",
  "emd_lessaffluent_loinfo_nomep",
  "emd_midloaffluent_hiinfo_nomep",
  "emd_midloaffluent_loinfo_nomep",
  "emd_midaffluent_hiinfo_nomep",
  "emd_midaffluent_loinfo_nomep",
  "emd_midhiaffluent_hiinfo_nomep",
  "emd_midhiaffluent_loinfo_nomep",
  "emd_moreaffluent_hiinfo_nomep",
  "emd_moreaffluent_loinfo_nomep"
)
for(col in rev(nomep.cols)){
  df.final[[col]] <- rep(NA, nrow(df.final))
}

# break out the data: one slice of elite.nomeps per row of df.final, matched
# on country and year; also note which survey(s) each slice comes from
elist <- list()
pb <- txtProgressBar(min = 1, max = nrow(df.final), style = 3)
for(i in seq_len(nrow(df.final))){
  same.cy <- elite.nomeps$country == df.final$country[i] &
    elite.nomeps$year == df.final$year[i]
  elist[[i]] <- elite.nomeps[which(same.cy), ]
  if(nrow(elist[[i]]) > 0){
    df.final$esurvey_nomep[i] <- paste0(unique(elist[[i]]$survey),
                                        collapse = ", ")
  }
  setTxtProgressBar(pb, i)
}
# run it
# For every country-year: earth mover's distance (EMD) between the "no MEP"
# elite ideology sample and the mass ideology sample -- overall, by affluence
# group, and by affluence group x political knowledge -- plus summary
# statistics of the elite sample. Results are written into the *_nomep columns
# of df.final.

# affluence groups and the `class` code that selects each one
# (presumably quantile labels from poorest "0" to richest "4" -- TODO confirm)
affluence.codes <- c(lessaffluent = "0", midloaffluent = "1",
                     midaffluent = "2", midhiaffluent = "3",
                     moreaffluent = "4")
# knowledge groups and the `know` code that selects each one
knowledge.codes <- c(hiinfo = "4", loinfo = "0")

# scaled ideology of the respondents flagged by `keep`, NAs dropped, returned
# as an n x 1 matrix
ideology.of <- function(d, keep){
  as.matrix(na.omit(d$ideology_scaled[which(keep)]))
}
# equal weights summing to one, one per row of `m` (0 x 1 when `m` is empty)
uniform.weights <- function(m){
  as.matrix(rep(1/nrow(m), nrow(m)))
}

pb <- txtProgressBar(min = 1, max = nrow(df.final), style = 3)
for(i in seq_len(nrow(df.final))){
  mdata <- mlist[[i]]
  edata <- elist[[i]]
  
  # nothing to compare unless both samples report some ideology
  if(nrow(edata[which(!is.na(edata$ideology)),]) == 0 |
     nrow(mdata[which(!is.na(mdata$ideology)),]) == 0){
    next
  }
  
  # create a temporary data frame of class measures, ranked by our preference
  class <- data.frame(cbind(mdata$country, mdata$wealth, mdata$income,
                            mdata$occupation, mdata$education))
  colnames(class) <- c("country","wealth","income","occupation","education")
  # drop the measures that are entirely missing in this survey
  for(j in ncol(class):2){
    if(all(is.na(unique(class[,j])))){
      class <- class[,-j]
    }
  }
  # make sure the loop uses our best class measure
  whichclass <- colnames(class)[2] # since 1 is country
  colnames(mdata)[grep(whichclass, names(mdata))] <- "class"
  
  # create temporary data for knowledge, again ranked by preference
  info <- data.frame(cbind(mdata$country, mdata$knowledge, mdata$education))
  colnames(info) <- c("country","knowledge","education")
  for(j in ncol(info):2){
    if(all(is.na(unique(info[,j])))){
      info <- info[,-j]
    }
  }
  # make sure the loop uses our best knowledge measure
  whichinfo <- colnames(info)[2]
  colnames(mdata)[grep(whichinfo, names(mdata))] <- "know"
  
  ### Elite
  y <- as.matrix(na.omit(edata$ideology_scaled))
  w.y <- uniform.weights(y)
  
  ### Mass samples, keyed by the df.final column that stores their EMD.
  ### Every group is built from its own class code, so the midlo ("1") and
  ### midhi ("3") groups cannot be mixed up.
  xs <- list(emd_all_nomep = as.matrix(na.omit(mdata$ideology_scaled)))
  for(a in names(affluence.codes)){
    in.class <- mdata$class == affluence.codes[[a]]
    xs[[paste0("emd_", a, "_nomep")]] <- ideology.of(mdata, in.class)
    for(k in names(knowledge.codes)){
      xs[[paste0("emd_", a, "_", k, "_nomep")]] <-
        ideology.of(mdata, in.class & mdata$know == knowledge.codes[[k]])
    }
  }
  
  # ensure everything is going correctly: we need the poorest group, the
  # richest group, and the elite sample to be non-empty
  if(length(xs[["emd_lessaffluent_nomep"]]) == 0 |
     length(xs[["emd_moreaffluent_nomep"]]) == 0 |
     length(y) == 0){
    warning("check the if-length condition!")
    next
  }
  
  # some defensive programming checks: weights of every non-empty sample must
  # sum to one
  ws <- lapply(xs, uniform.weights)
  stopifnot(isTRUE(all.equal(sum(w.y), 1)))
  for(nm in names(xs)){
    if(length(xs[[nm]]) > 0){
      stopifnot(isTRUE(all.equal(sum(ws[[nm]]), 1)))
    }
  }
  
  # the df.final row(s) for this country-year
  rows <- which(df.final$country == unique(mdata$country) &
                  df.final$year == unique(mdata$year))
  
  ### Compute EMDs (all mass, by affluence, by affluence x knowledge)
  for(nm in names(xs)){
    df.final[[nm]][rows] <- emdw(xs[[nm]], ws[[nm]], y, w.y,
                                 max.iter = 100000)
  }
  
  ### store mean and variance
  df.final$evar_nomep[rows] <- sd(y)^2
  df.final$emean_nomep[rows] <- mean(y)
  
  ### Store meta-data
  df.final$nobs_elite_nomep[rows] <- length(y)
  df.final$mscale_nomep[rows] <-
    paste(unique(mdata$ideology_scale_orig),collapse=", ")
  df.final$escale_nomep[rows] <-
    paste(unique(edata$ideology_scale_orig),collapse=", ")
  
  setTxtProgressBar(pb, i)
}
# fix a few NaNs that arise from small samples (is.na() is TRUE for NaN as
# well, so this turns every NaN into a plain NA)
df.final[is.na(df.final)] <- NA
# trim all the white space -- makes the next loops faster; only rows that have
# a value in emd_all_full are kept
df.final <- df.final[!is.na(df.final$emd_all_full), ]
# checkpoint the final data
write.csv(df.final, file = "./final/final-many-to-many.csv",
          row.names = FALSE)
# clean up: drop everything from the workspace except the utilities and the
# core data objects listed here
keep.names <- c("completeFun", "expand.grid.df", "qcut", "rescalr",
                "simpleCap", "mass", "elite", "elite.full", "elite.nomeps",
                "df.final")
rm(list = setdiff(ls(), keep.names))

##### ADD COVARIATES #####

### V-DEM 
# read the V-Dem country-year file, keep only the indicators used below, and
# rename them (V-Dem name on the left, name used in this script on the right)
vd <- read.csv(file = "./VDEM/V-Dem-DS-CY+Others-v7.1.csv",
               stringsAsFactors = FALSE)
vdem.map <- c(
  country_name   = "country",
  year           = "year",
  v2eldonate     = "vdem_donate",
  v2elpubfin     = "vdem_pubfin",
  v2x_polyarchy  = "vdem_polyarchy",
  v2x_libdem     = "vdem_libdem",
  v2x_delibdem   = "vdem_delibdem",
  v2x_egaldem    = "vdem_egaldem",
  v2x_partipdem  = "vdem_partipdem",
  v2elcomvot     = "vdem_compulsory",
  v2elvaptrn     = "vdem_turnout_vap",
  v2elvaptrn_leg = "vdem_turnout_vap_leg",
  v2eltrnout     = "vdem_turnout_reg",
  v2eltrnout_leg = "vdem_turnout_reg_leg",
  e_dpi_erlc     = "vdem_left_gov",
  v2xps_party    = "vdem_party_inst",
  v2x_cspart     = "vdem_cs_partic",
  v2xcs_ccsi     = "vdem_ccso_index",
  v2x_corr       = "vdem_corr_index",
  v2lgcrrpt      = "vdem_corr_leg",
  e_ti_cpi       = "vdem_corr_ti_cpi",
  v2psprlnks     = "vdem_clientelism",
  e_migdppc      = "vdem_gdp_maddison"
)
vd <- setNames(vd[, names(vdem.map)], unname(vdem.map))
rm(vdem.map)
# clean country names (presumably to match the spelling in df.final -- confirm)
vd$country[vd$country %in% "Slovakia"] <- "Slovak Republic"
vd$country[vd$country %in% "Venezuela"] <- "Venezuela, RB"
# carry forward turnout rates: within each country, every missing value takes
# the most recent non-missing value above it; missing values before the first
# observation stay missing. Rows are used in the order they appear in vd
# (assumes they are chronological within country -- TODO confirm).
tmp <- lapply(split(vd, vd$country), function(cy) {
  for (v in c("vdem_turnout_vap",        # VAP turnout
              "vdem_turnout_vap_leg",    # VAP legislative turnout
              "vdem_turnout_reg",        # registered voter turnout
              "vdem_turnout_reg_leg")) { # registered legislative turnout
    # positions that start a run: each observed value, plus row 1 if it is NA
    starts <- which(!is.na(cy[[v]]))
    if (is.na(cy[[v]][1])) {
      starts <- c(1, starts)
    }
    # repeat every starting value until the next start (or the end)
    run.lengths <- diff(c(starts, length(cy[[v]]) + 1))
    cy[[v]] <- rep(cy[[v]][starts], run.lengths)
  }
  cy
})
vd <- do.call(rbind, tmp)
# remove excess data: for every row of df.final, pull the V-Dem rows with the
# same country and year, then stack the pieces in df.final's row order
tmp <- lapply(seq_len(nrow(df.final)), function(r) {
  same_cell <- vd$country == df.final$country[r] & vd$year == df.final$year[r]
  vd[which(same_cell), ]
})
vd <- do.call(rbind, tmp)
# factor democracy: score on the first principal component of four of the
# V-Dem indices (these are columns 6 to 9 of vd; vdem_polyarchy is not included)
democracy_vars <- c("vdem_libdem", "vdem_delibdem", "vdem_egaldem",
                    "vdem_partipdem")
facs <- PCA(vd[, democracy_vars], ncp = 5, graph = FALSE)
vd$vdem_democracy_factored <- facs$ind$coord[, 1]
# fix compulsory voting to be 0/1: anything above 1 is collapsed to 1
above_one <- which(vd$vdem_compulsory > 1)
vd$vdem_compulsory[above_one] <- 1
# fix Hungary 2013 -- not compulsory to vote
hungary_2013 <- which(vd$country == "Hungary" & vd$year == 2013)
vd$vdem_compulsory[hungary_2013] <- 0
# fix the turnout variables to be proportions
for (v in c("vdem_turnout_vap", "vdem_turnout_vap_leg",
            "vdem_turnout_reg", "vdem_turnout_reg_leg")) {
  vd[[v]] <- vd[[v]] / 100
}
# fix leftist government into binary: values below 3 become 0, exactly 3
# becomes 1 (both sets are located before either is overwritten)
below_three <- which(vd$vdem_left_gov < 3)
equal_three <- which(vd$vdem_left_gov == 3)
vd$vdem_left_gov[below_three] <- 0
vd$vdem_left_gov[equal_three] <- 1
# merge to main data
df.final <- merge(df.final, vd, by = c("country", "year"), all = TRUE)
# clean up: keep only the utilities and the core data objects
keep <- c("completeFun", "expand.grid.df", "qcut", "rescalr", "simpleCap",
          "mass", "elite", "elite.full", "elite.nomeps", "df.final")
rm(list = setdiff(ls(), keep))

### merge Dreher's international capital data
dr <- read_xlsx("./Dreher/Data_2018_2.xlsx")
# all variables clean so just subset, rename, and merge
# source column (name) -> name used in this script (value)
kof_vars <- c(KOFTrGIdf = "trade_globalization",
              KOFFiGIdf = "financial_globalization",
              KOFIpGIdf = "personal_globalization",
              KOFInGIdf = "information_globalization",
              KOFCuGIdf = "cultural_globalization",
              KOFPoGIdf = "political_globalization")
dr <- dr[, c("country", "year", names(kof_vars))]
names(dr)[3:8] <- unname(kof_vars)
df.final <- merge(df.final, dr, by = c("country", "year"), all.x = TRUE)
rm(dr, kof_vars)

### merge Selway's data on cleavage structures
sel <- read_xls("./Selway/Aug 2013 Includes 155 Country Dataset.xls",
                sheet = 2)
# subset to just the cross-cuttingness variables - not sure why only 12 here
# source column (name) -> name used in this script (value)
xcut_vars <- c(country = "country",
               RaRC = "xcut_race_religion",
               RaGC = "xcut_race_geog",
               RaIC = "xcut_race_income",
               ERC  = "xcut_ethnicity_religion",
               EGC  = "xcut_ethnicity_geog",
               EIC  = "xcut_ethnicity_income",
               LRC  = "xcut_lang_religion",
               LGC  = "xcut_lang_geog",
               LIC  = "xcut_lang_income",
               RGC  = "xcut_religion_geog",
               RIC  = "xcut_religion_income",
               IGC  = "xcut_income_geog")
sel <- sel[, names(xcut_vars)]
# fix names
names(sel) <- unname(xcut_vars)
# fix a few country names
name_fixes <- c("Cyprus (Republic)" = "Cyprus",
                "Great Britain" = "United Kingdom",
                "Slovakia" = "Slovak Republic",
                "Venezuela" = "Venezuela, RB")
for (old_name in names(name_fixes)) {
  sel$country[which(sel$country == old_name)] <- name_fixes[[old_name]]
}
# merge (country-level data, so it is repeated across each country's years)
df.final <- merge(df.final, sel, by = c("country"), all.x = TRUE)
rm(sel, xcut_vars, name_fixes, old_name)

### merge ILO data on trade unions
ilo1 <- as.data.frame(read_xls("./ILO/ILO-collective-bargaining-coverage.xls"))
ilo2 <- as.data.frame(read_xls("./ILO/ILO-trade-union-density.xls"))
# clean names and subset to variables/rows we need: both sheets get the same
# headers, then the first six rows and the survey/type columns are dropped
ilo_years <- 2000:2016
year_cols <- paste0("year_", ilo_years)
names(ilo1) <- c("country", "survey", "type", year_cols)
names(ilo2) <- names(ilo1)
ilo1 <- ilo1[-(1:6), -(2:3)]
ilo2 <- ilo2[-(1:6), -(2:3)]
# reshape from one column per year to one row per country-year
ilo1 <- reshape(ilo1, direction = "long",
                varying = list(year_cols),
                v.names = "coll_bargaining_pc", idvar = "country",
                timevar = "year", times = ilo_years)
ilo2 <- reshape(ilo2, direction = "long",
                varying = list(year_cols),
                v.names = "trade_union_density", idvar = "country",
                timevar = "year", times = ilo_years)
# merge ILO data together
ilo <- merge(ilo1, ilo2, by = c("country", "year"), all = TRUE)
rm(ilo1, ilo2)
# fix country names
name_fixes <- c("Slovakia" = "Slovak Republic",
                "Venezuela, Bolivarian Republic of" = "Venezuela, RB")
for (old_name in names(name_fixes)) {
  ilo$country[which(ilo$country == old_name)] <- name_fixes[[old_name]]
}
# merge
df.final <- merge(df.final, ilo, by = c("country", "year"), all.x = TRUE)
rm(ilo, ilo_years, year_cols, name_fixes, old_name)

### merge Dreher corruption data
dr <- as.data.frame(read_xls("./Dreher/corruption index.xls", sheet = 2))
# remove the non-parsimonious model, since there's only one period of it
dr$`1991-1997` <- NULL
# fix NAs and types: "." marks a missing value; every column becomes character
dr[dr == "."] <- NA
dr[] <- lapply(dr, as.character)
# Expand the k-th period column (column k + 1 of dr, whose header is a year
# range such as "YYYY-YYYY", optionally ending in "p") into one row per
# country and year, repeating the period's value for every year in the range.
expand_period <- function(k) {
  # subset the data
  period <- dr[, c(1, k + 1)]
  # get the year range and clean
  label <- gsub("p$", "", names(period)[2])
  years <- seq(gsub("([0-9]{4})(-.*)", "\\1", label),
               gsub("(.*-)([0-9]{4})", "\\2", label), 1)
  # replicate and bind: one identical column per year of the range
  repeated <- matrix(rep(period[, 2], length(years)), ncol = length(years))
  wide <- as.data.frame(cbind(period$country, repeated))
  names(wide) <- c("country", paste("year_", years, sep = ""))
  # reshape
  reshape(wide, direction = "long",
          varying = list(names(wide)[which(grepl("year_", names(wide)))]),
          v.names = "dreher_corruption", idvar = "country",
          timevar = "year", times = years)
}
# split into list so we can expand out each time series
dl <- lapply(seq_len(ncol(dr) - 1), expand_period)
# merge and fix classes
dr <- do.call(rbind, dl)
dr$country <- as.character(dr$country)
dr$year <- as.character(dr$year)
dr$dreher_corruption <- as.numeric(as.character(dr$dreher_corruption))
# merge
df.final <- merge(df.final, dr, by = c("country", "year"), all.x = TRUE)
# clean up: keep only the utilities and the core data objects
keep <- c("completeFun", "expand.grid.df", "qcut", "rescalr", "simpleCap",
          "mass", "elite", "elite.full", "elite.nomeps", "df.final")
rm(list = setdiff(ls(), keep))

### merge Boix-Miller-Rosato
bmr <- read.csv("./BMR/democracy-v3.0.csv", stringsAsFactors = FALSE)
# fix country names (this file spells them in upper case)
name_fixes <- c("SLOVAKIA" = "SLOVAK REPUBLIC", "VENEZUELA" = "VENEZUELA, RB")
for (old_name in names(name_fixes)) {
  bmr$country[which(bmr$country == old_name)] <- name_fixes[[old_name]]
}
# subset to vars we need, with the country column renamed to the merge key
bmr <- setNames(bmr[, c("country", "year", "democracy_duration")],
                c("ccupper", "year", "democracy_duration"))
# create an uppercase version of df.final's country names to merge on
df.final$ccupper <- toupper(df.final$country)
# merge
df.final <- merge(df.final, bmr, by = c("ccupper", "year"), all.x = TRUE)
# delete merge variable
df.final$ccupper <- NULL
rm(bmr, name_fixes, old_name)

### merge DALP data
dalp <- read.csv("./DALP/countrylevel_20130907.csv", stringsAsFactors = FALSE)
dalp <- setNames(dalp[, c("country", "cosalpo_4nwe", "cosalpo_3econnwe")],
                 c("country", "dalp_clientelism", "dalp_clientelism_econ"))
# fix country names
name_fixes <- c("Czech Rep." = "Czech Republic",
                "Dom. Rep." = "Dominican Republic",
                "Slovakia" = "Slovak Republic",
                "UK" = "United Kingdom",
                "Venezuela" = "Venezuela, RB")
for (old_name in names(name_fixes)) {
  dalp$country[which(dalp$country == old_name)] <- name_fixes[[old_name]]
}
# merge (country-level data, so it is repeated across each country's years)
df.final <- merge(df.final, dalp, by = c("country"), all.x = TRUE)
rm(dalp, name_fixes, old_name)

### merge SWIID data
# the .rda file provides swiid_summary and swiid; only the former is used
load("./SWIID/swiid8_0.rda")
sw <- as.data.frame(swiid_summary)[, c("country", "year", "gini_disp")]
rm(swiid_summary, swiid)
# fix country names
name_fixes <- c("Slovakia" = "Slovak Republic", "Venezuela" = "Venezuela, RB")
for (old_name in names(name_fixes)) {
  sw$country[which(sw$country == old_name)] <- name_fixes[[old_name]]
}
# year is converted to character before merging
sw$year <- as.character(sw$year)
df.final <- merge(df.final, sw, by = c("country", "year"), all.x = TRUE)
rm(sw, name_fixes, old_name)

### merge QOG data
qog <- read.csv("./QOG/qog_std_ts_jan19.csv", stringsAsFactors = FALSE)
qog <- setNames(qog[, c("cname", "year", "undp_hdi", "vi_wcoord")],
                c("country", "year", "undp_hdi", "vi_wcoord"))
# fix country names
name_fixes <- c("Cyprus (1975-)" = "Cyprus",
                "France (1963-)" = "France",
                "Slovakia" = "Slovak Republic",
                "Venezuela" = "Venezuela, RB")
for (old_name in names(name_fixes)) {
  qog$country[which(qog$country == old_name)] <- name_fixes[[old_name]]
}
# merge
df.final <- merge(df.final, qog, by = c("country", "year"), all.x = TRUE)
rm(qog, name_fixes, old_name)

### merge ICTWSS
ic <- read.csv("./ICTWSS/ictwss_v5.1.csv", stringsAsFactors = FALSE)
# the country column is sometimes read in as "X.country" instead of "country"
# (NOTE(review): presumably a stray leading character in the CSV header --
# confirm); use whichever of the two is present
country_col <- if ("X.country" %in% names(ic)) "X.country" else "country"
ic <- ic[, c(country_col, "year", "cent", "conc")]
names(ic) <- c("country", "year", "ictwss_cent", "ictwss_conc")
# year is converted to character before merging
ic$year <- as.character(ic$year)
df.final <- merge(df.final, ic, by = c("country", "year"), all.x = TRUE)
rm(ic, country_col)

### merge IDEA campaign finance
# load data; every column is turned into character
idea <- read.csv("./IDEA-pol-finance/Political-Finance-Data.csv")
idea[] <- lapply(idea, as.character)
# the first data row is used as the column headers and then dropped
colnames(idea) <- idea[1,]
idea <- idea[-1,]
# recode the answers cell by cell (answer text -> code), in this order
answer_codes <- c("No, but specific limit" = "2",
                  "Yes" = "1",
                  "Regular limit applies" = "1",
                  "No" = "0",
                  "No data" = NA,
                  "Not applicable" = NA)
for (answer in names(answer_codes)) {
  idea[idea == answer] <- answer_codes[[answer]]
}
# pick only questions we want
idea <- idea[, c(1, 5, 17, 33)]
colnames(idea) <- c("country", "ban_corp_don_parties",
                    "contrib_limit_parties", "limit_party_spending")
# MCA on campaign finance: complete cases only, questions as factors, and the
# coordinate on the first dimension is kept as the score
mca_rows <- idea[complete.cases(idea), ]
mca_rows[2:4] <- lapply(mca_rows[2:4], as.factor)
mca_rows$camp_fin <- MCA(mca_rows[, 2:4], ncp = 5,
                         graph = FALSE)$ind$coord[, 1]
mca_rows <- mca_rows[, c("country", "camp_fin")]
mca_rows$camp_fin <- as.character(mca_rows$camp_fin)
# fix country and merge
idea <- merge(idea, mca_rows, by = "country", all = TRUE)
name_fixes <- c("Slovakia" = "Slovak Republic", "Venezuela" = "Venezuela, RB")
for (old_name in names(name_fixes)) {
  idea$country[which(idea$country == old_name)] <- name_fixes[[old_name]]
}
# additive index: sum of the three recoded answers
idea$camp_fin_add <- as.numeric(idea$ban_corp_don_parties) +
  as.numeric(idea$contrib_limit_parties) +
  as.numeric(idea$limit_party_spending)
# merge (country-level data, so it is repeated across each country's years)
df.final <- merge(df.final, idea, all.x = TRUE, all.y = FALSE, by = "country")
# fix classes: every column that came from idea (except country) is numeric
fins <- which(colnames(df.final) %in% colnames(idea)[-1])
df.final[fins] <- lapply(df.final[fins],
                         function(v) as.numeric(as.character(v)))
# clean up: keep only the utilities and the core data objects
keep <- c("completeFun", "expand.grid.df", "qcut", "rescalr", "simpleCap",
          "mass", "elite", "elite.full", "elite.nomeps", "df.final")
rm(list = setdiff(ls(), keep))

### IDEA time-series data
idea_ts <- read.csv("./IDEA-pol-finance/idea-time-series.csv",
                    stringsAsFactors = FALSE)
# the first column holds the country
names(idea_ts)[1] <- "country"
name_fixes <- c("Slovakia" = "Slovak Republic", "Venezuela" = "Venezuela, RB")
for (old_name in names(name_fixes)) {
  idea_ts$country[which(idea_ts$country == old_name)] <- name_fixes[[old_name]]
}
idea_ts <- idea_ts[, c("country", "year", "ok_ban_corp_don_parties",
                       "ok_contrib_limit_parties", "ok_limit_party_spending")]
# swap "ok" for "ts" in the three variable names
names(idea_ts)[3:5] <- gsub("ok", "ts", names(idea_ts)[3:5])
df.final <- merge(df.final, idea_ts, by = c("country", "year"),
                  all.x = TRUE, all.y = FALSE)

### manually code region
# countries that appear in none of the three lists keep region = NA
df.final$region <- rep(NA, nrow(df.final))
euro <- c("Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus",
          "Czech Republic", "Denmark", "Estonia", "Finland", "France",
          "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Italy",
          "Latvia", "Lithuania", "Luxembourg", "Malta", "Netherlands", "Norway",
          "Poland", "Portugal", "Romania", "Slovak Republic", "Slovenia",
          "Spain", "Sweden", "Switzerland", "Ukraine", "United Kingdom")
latam <- c("Argentina", "Bolivia", "Brazil", "Chile", "Colombia", "Costa Rica",
           "Dominican Republic", "Ecuador", "El Salvador", "Guatemala",
           "Honduras", "Mexico", "Nicaragua", "Panama", "Paraguay", "Peru",
           "Uruguay", "Venezuela, RB")
# labelled "Australasia" below despite the vector's name
asia <- c("Australia", "Japan")
df.final$region[df.final$country %in% euro] <- "Europe"
df.final$region[df.final$country %in% latam] <- "Latin America"
df.final$region[df.final$country %in% asia] <- "Australasia"

### compulsory voting, hand-coded from IDEA
# stored as the strings "0"/"1"; every country starts at "0"
df.final$compulsory <- rep("0", nrow(df.final))
# countries coded "1" in every year (Cyprus is coded this way per IDEA)
always_compulsory <- c("Argentina", "Australia", "Belgium", "Bolivia",
                       "Brazil", "Costa Rica", "Cyprus", "Dominican Republic",
                       "Ecuador", "Greece", "Honduras", "Luxembourg", "Mexico",
                       "Paraguay", "Peru", "Uruguay")
df.final$compulsory[df.final$country %in% always_compulsory] <- "1"
# Chile is coded "1" only for years before 2012
chile_pre_2012 <- which(df.final$country == "Chile" &
                          df.final$year < "2012")
df.final$compulsory[chile_pre_2012] <- "1"

### Disproportionality data from Christopher Gandrud
# current as of May 2019; downloaded at run time, so this needs a connection
disprop <- source_data(url = "http://bit.ly/Ss6zDO")
disprop$iso2c <- NULL
# fix country names
name_fixes <- c("Bolivia, Plurinational State of" = "Bolivia",
                "Venezuela, Bolivarian Republic of" = "Venezuela, RB",
                "Slovakia" = "Slovak Republic")
for (old_name in names(name_fixes)) {
  disprop$country[which(disprop$country == old_name)] <- name_fixes[[old_name]]
}
df.final <- merge(df.final, disprop, by = c("country", "year"),
                  all.x = TRUE, all.y = FALSE)

### Variables from DPI
dpi <- read.dta("./DPI2015/DPI2015_stata12.dta")
# every column is turned into character, so the codes below are strings
dpi[] <- lapply(dpi, as.character)
# fix NAs: these codes are treated as missing
for (missing_code in c("-999", "-999.00", "-88")) {
  dpi[dpi == missing_code] <- NA
}
# fix country names
name_fixes <- c("Czech Rep." = "Czech Republic",
                "Dom. Rep." = "Dominican Republic",
                "FRG/Germany" = "Germany",
                "Slovakia" = "Slovak Republic",
                "UK" = "United Kingdom",
                "Venezuela" = "Venezuela, RB")
for (old_name in names(name_fixes)) {
  dpi$countryname[which(dpi$countryname == old_name)] <- name_fixes[[old_name]]
}
# get the variables we need - note the typo in the data with plurality
dpi <- setNames(
  dpi[, c("countryname", "year", "pr", "system", "pluralty", "mdmh",
          "partyage")],
  c("country", "year", "pr", "presidential", "fptp", "dist_mag",
    "mean_party_age")
)
# recode presidentialism: "0" becomes "1" first, then "2" becomes "0"
# (a "1" is left as it is)
dpi$presidential[which(dpi$presidential == "0")] <- "1"
dpi$presidential[which(dpi$presidential == "2")] <- "0"
# merge back to main data
df.final <- merge(df.final, dpi, by = c("country", "year"),
                  all.x = TRUE, all.y = FALSE)
# clean up: keep only the utilities and the core data objects
keep <- c("completeFun", "expand.grid.df", "qcut", "rescalr", "simpleCap",
          "mass", "elite", "elite.full", "elite.nomeps", "df.final")
rm(list = setdiff(ls(), keep))

### calculate % women in our data
# recode sex to 0 (Male) / 1 (Female) and make it numeric; note that this
# changes elite itself, which is kept across the clean-up steps
sex_codes <- c("Male" = "0", "Female" = "1")
for (label in names(sex_codes)) {
  elite$sex[which(elite$sex == label)] <- sex_codes[[label]]
}
elite$sex <- as.numeric(elite$sex)
# share of women among the elite respondents of each country-year; a
# country-year without respondents gives NaN here
df.final$pc_women_elite <- vapply(seq_len(nrow(df.final)), function(r) {
  in_cell <- which(elite$country == df.final$country[r] &
                     elite$year == df.final$year[r])
  mean(elite$sex[in_cell], na.rm = TRUE)
}, numeric(1))
# turn those NaNs into NA (is.na() is TRUE for NaN as well)
df.final[is.na(df.final)] <- NA

### get women legislators from scraper and sources
wm <- as.data.frame(as_factor(read_dta("./women elites/impact123015.dta")))
wm <- wm[, c("country", "year", "wom")]
# empty strings count as missing; rows without a country are dropped
wm[wm == ""] <- NA
wm <- wm[!is.na(wm$country), ]
# clean names
name_fixes <- c(
  "Germany Federal Republic of" = "Germany",
  "United Kingdom of Great Britain and Northern Irela" = "United Kingdom",
  "Slovakia" = "Slovak Republic",
  "Venezuela" = "Venezuela, RB",
  "Czechoslovakia" = "Czech Republic"
)
for (old_name in names(name_fixes)) {
  wm$country[which(wm$country == old_name)] <- name_fixes[[old_name]]
}
# merge
df.final <- merge(df.final, wm, by = c("country", "year"),
                  all.x = TRUE, all.y = FALSE)
# now use the scraped data
wmscr <- read.csv("./women elites/women-elites.csv", stringsAsFactors = FALSE)
wmscr <- wmscr[, c("country", "year", "pc_women_elite_real")]
scraped_fixes <- c("Venezuela" = "Venezuela, RB",
                   "Slovakia" = "Slovak Republic")
for (old_name in names(scraped_fixes)) {
  wmscr$country[which(wmscr$country == old_name)] <- scraped_fixes[[old_name]]
}
# merge in
df.final <- merge(df.final, wmscr, by = c("country", "year"),
                  all.x = TRUE, all.y = FALSE)
# standardize: wom is divided by 100 and fills in wherever the scraped value
# is missing; wom itself is then dropped
df.final$wom <- df.final$wom / 100
scraped_missing <- is.na(df.final$pc_women_elite_real)
df.final$pc_women_elite_real <- ifelse(scraped_missing,
                                       df.final$wom,
                                       df.final$pc_women_elite_real)
df.final$wom <- NULL
# code a few that are missing manually...
# Germany 1996 missing from World Bank but no election since 1994, so use 1994
# Peru 2000 is the Fujimori interregnum, so just carry forward from 1999
# NOTE(review): the 0.04 for Germany 1996 is far below the other hand-coded
# German values (about 0.31) -- confirm against the source
manual_codes <- data.frame(
  country = c(rep("Germany", 6), "Peru", rep("Switzerland", 2)),
  year = c("1996", "1999", "2000", "2001", "2002", "2003",
           "2000", "1975", "1979"),
  value = c(.04, .309, .309, .311, .322, .322, .108, .061, .061),
  stringsAsFactors = FALSE
)
for (k in seq_len(nrow(manual_codes))) {
  hit <- which(df.final$country == manual_codes$country[k] &
                 df.final$year == manual_codes$year[k])
  df.final$pc_women_elite_real[hit] <- manual_codes$value[k]
}
# clean up: keep only the utilities and the core data objects
keep <- c("completeFun", "expand.grid.df", "qcut", "rescalr", "simpleCap",
          "mass", "elite", "elite.full", "elite.nomeps", "df.final")
rm(list = setdiff(ls(), keep))

### CLEA turnout
# Builds clea_turnout = votes cast / eligible voters per country-year from the
# CLEA file and merges it into df.final.
load("./CLEA/clea_lc_20181119.rdata")
clea <- as.data.frame(clea_lc_20181119); rm(clea_lc_20181119)
# subset to just variables we need (suffix 1 and 2 are the two sets of
# eligible/votes columns in the source: pev1/vot1 and pev2/vot2)
clea <- clea[, c("ctr_n", "yr", "pev1", "vot1", "pev2", "vot2")]
names(clea) <- c("country", "year", "eligible1", "voted1",
                 "eligible2", "voted2")
# fix classes and NAs: counts become numeric and anything below -99 is
# treated as a missing-value code
clea$year <- as.character(clea$year)
for (v in c("eligible1", "voted1", "eligible2", "voted2")) {
  clea[[v]] <- as.numeric(as.character(clea[[v]]))
  clea[[v]][which(clea[[v]] < -99)] <- NA
}
# fix country names
name_fixes <- c("UK" = "United Kingdom",
                "Slovakia" = "Slovak Republic",
                "Venezuela" = "Venezuela, RB")
for (old_name in names(name_fixes)) {
  clea$country[which(clea$country == old_name)] <- name_fixes[[old_name]]
}
# remove excess data: keep only the rows whose country-year is in df.final
# (seq_len() rather than 1:n, so an empty df.final gives no iterations)
tmp <- lapply(seq_len(nrow(df.final)), function(r) {
  clea[which(clea$country == df.final$country[r] &
               clea$year == df.final$year[r]), ]
})
clea <- do.call(rbind, tmp)
# merge the rounds: where second-round figures exist they are added to the
# first-round figures ROW BY ROW. (Wrapping the addition in sum() would
# collapse it to a single grand total over all rows, which ifelse() would then
# recycle into every two-round row.) A row with a second-round figure but no
# first-round figure stays NA and is dropped by na.omit() below.
clea$eligible <- ifelse(is.na(clea$eligible2),
                        clea$eligible1,
                        clea$eligible1 + clea$eligible2)
clea$voted <- ifelse(is.na(clea$voted2),
                     clea$voted1,
                     clea$voted1 + clea$voted2)
clea <- clea[, c("country", "year", "eligible", "voted")]
# drop NAs
clea <- na.omit(clea)
# drop duplicates, since these are from multiple candidates
clea <- clea[!duplicated(clea), ]
# sum the votes by country-year
clea <- aggregate(cbind(eligible, voted) ~ country + year, data = clea,
                  FUN = sum)
# create turnout variable
clea$clea_turnout <- clea$voted / clea$eligible
# clean up and merge
clea <- clea[, c("country", "year", "clea_turnout")]
df.final <- merge(df.final, clea, by = c("country", "year"), all = TRUE)
keep <- c("completeFun", "expand.grid.df", "qcut", "rescalr", "simpleCap",
          "mass", "elite", "elite.full", "elite.nomeps", "df.final")
rm(list = setdiff(ls(), keep))

### CHES
ches <- read.csv("./CHES/1999-2014_CHES_dataset_means.csv",
                 stringsAsFactors = FALSE)
ches <- ches[, c("country", "electionyear", "govt", "seat", "lrgen")]
# everything is handled as character until the classes are fixed below
ches[] <- lapply(ches, as.character)
# first, fix countries: CHES abbreviation (name) -> country name (value)
country_codes <- c(
  "be" = "Belgium", "dk" = "Denmark", "ge" = "Germany", "gr" = "Greece",
  "esp" = "Spain", "fr" = "France", "irl" = "Ireland", "it" = "Italy",
  "nl" = "Netherlands", "uk" = "United Kingdom", "por" = "Portugal",
  "aus" = "Austria", "fin" = "Finland", "sv" = "Sweden", "bul" = "Bulgaria",
  "cz" = "Czech Republic", "est" = "Estonia", "hun" = "Hungary",
  "lat" = "Latvia", "lith" = "Lithuania", "pol" = "Poland",
  "rom" = "Romania", "slo" = "Slovak Republic", "sle" = "Slovenia",
  "cro" = "Croatia", "mal" = "Malta", "lux" = "Luxembourg", "cyp" = "Cyprus"
)
for (abbr in names(country_codes)) {
  ches$country[which(ches$country == abbr)] <- country_codes[[abbr]]
}
# keep only the rows whose govt value is "in government" or ".5"
ches <- ches[which(ches$govt %in% c("in government", ".5")), ]
# remove unnecessary vars and tidy: the election year serves as the year
ches$year <- ches$electionyear
ches <- ches[order(ches$country, ches$year),
             c("country", "year", "seat", "lrgen")]
# fix ideology when it's at center and fix classes
ches$lrgen[which(ches$lrgen == "center")] <- "5"
ches$lrgen <- as.numeric(ches$lrgen)
ches$seat <- as.numeric(ches$seat)
# fix a few seats - PRG and DL in France 2002, NCD in Italy 2013
# (the two French rows are told apart by their left-right score)
france_2002_gap <- ches$country == "France" & ches$year == "2002" &
  is.na(ches$seat)
ches$seat[which(france_2002_gap & ches$lrgen < 4)] <- 7
ches$seat[which(france_2002_gap & ches$lrgen > 6)] <- 1
italy_2013_gap <- ches$country == "Italy" & ches$year == "2013" &
  is.na(ches$seat)
ches$seat[which(italy_2013_gap)] <- 0
# split out by country-year and make average gov't position: the lrgen scores
# of the rows in each country-year, weighted by their seat values
by_election <- split(ches, f = list(ches$country, ches$year), drop = TRUE)
ches <- do.call(rbind, lapply(by_election, function(cabinet) {
  cabinet$left_gov <- cabinet$seat * cabinet$lrgen / sum(cabinet$seat)
  aggregate(left_gov ~ country + year, cabinet, sum)
}))
# make into a panel: one row for every year between a country's first and
# last observed year (countries observed in a single year are left as is)
by_country <- split(ches, f = list(ches$country))
by_country <- lapply(by_country, function(series) {
  first_year <- min(as.numeric(series$year))
  last_year <- max(as.numeric(series$year))
  if (first_year == last_year) {
    return(series)
  }
  merge(series, expand.grid(year = seq(first_year, last_year, 1),
                            country = unique(series$country)), all = TRUE)
})
# fill in missing values: each missing year takes the value of the row above
by_country <- lapply(by_country, function(series) {
  if (nrow(series) == 1) {
    return(series)
  }
  for (j in 2:nrow(series)) {
    if (is.na(series$left_gov[j])) {
      series$left_gov[j] <- series$left_gov[j - 1]
    }
  }
  series
})
ches <- do.call(rbind, by_country)
# merge and clean up
df.final <- merge(df.final, ches, by = c("country", "year"),
                  all.x = TRUE, all.y = FALSE)
keep <- c("completeFun", "expand.grid.df", "qcut", "rescalr", "simpleCap",
          "mass", "elite", "elite.full", "elite.nomeps", "df.final")
rm(list = setdiff(ls(), keep))

### Manually carry forward ideology in cases where CHES data still make sense
# Each rule copies a country's left_gov value in the `from` year into each of
# the `to` years. The rules run in the order listed, and no `from` year is ever
# a `to` year, so they do not depend on each other.
ches_carry <- list(
  list(country = "Austria",        from = "2013", to = c("2014", "2015")),
  list(country = "Belgium",        from = "2014", to = "2015"),
  list(country = "Bulgaria",       from = "2014", to = "2015"),
  list(country = "Cyprus",         from = "2011", to = as.character(2012:2015)),
  list(country = "Czech Republic", from = "2013", to = c("2014", "2015")),
  list(country = "Denmark",        from = "2011", to = as.character(2012:2015)),
  list(country = "Estonia",        from = "2011", to = as.character(2012:2014)),
  list(country = "Finland",        from = "2011", to = as.character(2012:2015)),
  list(country = "France",         from = "2012", to = as.character(2013:2015)),
  list(country = "Germany",        from = "2013", to = c("2014", "2015")),
  list(country = "Greece",         from = "2012", to = as.character(2013:2015)),
  list(country = "Hungary",        from = "2014", to = "2015"),
  list(country = "Ireland",        from = "2011", to = as.character(2012:2015)),
  list(country = "Italy",          from = "2013", to = c("2014", "2015")),
  list(country = "Latvia",         from = "2014", to = "2015"),
  list(country = "Lithuania",      from = "2012", to = as.character(2013:2015)),
  list(country = "Malta",          from = "2013", to = c("2014", "2015")),
  list(country = "Netherlands",    from = "2012", to = as.character(2013:2015)),
  list(country = "Poland",         from = "2011", to = as.character(2012:2015)),
  list(country = "Portugal",       from = "2011", to = as.character(2012:2014))
)
for (rule in ches_carry) {
  for (target_year in rule$to) {
    src_rows <- which(df.final$country == rule$country &
                        df.final$year == rule$from)
    dst_rows <- which(df.final$country == rule$country &
                        df.final$year == target_year)
    df.final$left_gov[dst_rows] <- df.final$left_gov[src_rows]
  }
}
rm(ches_carry, rule, target_year, src_rows, dst_rows)
df.final$left_gov[which(df.final$country == "Portugal" & df.final$year == "2015")] <- 
  df.final$left_gov[which(df.final$country == "Portugal" & df.final$year == "2011")]
df.final$left_gov[which(df.final$country == "Romania" & df.final$year == "2013")] <- 
  df.final$left_gov[which(df.final$country == "Romania" & df.final$year == "2012")]
df.final$left_gov[which(df.final$country == "Romania" & df.final$year == "2014")] <- 
  df.final$left_gov[which(df.final$country == "Romania" & df.final$year == "2012")]
df.final$left_gov[which(df.final$country == "Romania" & df.final$year == "2015")] <- 
  df.final$left_gov[which(df.final$country == "Romania" & df.final$year == "2012")]
df.final$left_gov[which(df.final$country == "Slovak Republic" & df.final$year == "2013")] <- 
  df.final$left_gov[which(df.final$country == "Slovak Republic" & df.final$year == "2012")]
df.final$left_gov[which(df.final$country == "Slovak Republic" & df.final$year == "2014")] <- 
  df.final$left_gov[which(df.final$country == "Slovak Republic" & df.final$year == "2012")]
df.final$left_gov[which(df.final$country == "Slovak Republic" & df.final$year == "2015")] <- 
  df.final$left_gov[which(df.final$country == "Slovak Republic" & df.final$year == "2012")]
df.final$left_gov[which(df.final$country == "Slovenia" & df.final$year == "2015")] <- 
  df.final$left_gov[which(df.final$country == "Slovenia" & df.final$year == "2014")]
df.final$left_gov[which(df.final$country == "Spain" & df.final$year == "2012")] <- 
  df.final$left_gov[which(df.final$country == "Spain" & df.final$year == "2011")]
df.final$left_gov[which(df.final$country == "Spain" & df.final$year == "2013")] <- 
  df.final$left_gov[which(df.final$country == "Spain" & df.final$year == "2011")]
df.final$left_gov[which(df.final$country == "Spain" & df.final$year == "2014")] <- 
  df.final$left_gov[which(df.final$country == "Spain" & df.final$year == "2011")]
df.final$left_gov[which(df.final$country == "Spain" & df.final$year == "2015")] <- 
  df.final$left_gov[which(df.final$country == "Spain" & df.final$year == "2011")]
df.final$left_gov[which(df.final$country == "Sweden" & df.final$year == "2015")] <- 
  df.final$left_gov[which(df.final$country == "Sweden" & df.final$year == "2014")]
df.final$left_gov[which(df.final$country == "United Kingdom" & df.final$year == "2011")] <- 
  df.final$left_gov[which(df.final$country == "United Kingdom" & df.final$year == "2010")]
df.final$left_gov[which(df.final$country == "United Kingdom" & df.final$year == "2012")] <- 
  df.final$left_gov[which(df.final$country == "United Kingdom" & df.final$year == "2010")]
df.final$left_gov[which(df.final$country == "United Kingdom" & df.final$year == "2013")] <- 
  df.final$left_gov[which(df.final$country == "United Kingdom" & df.final$year == "2010")]
df.final$left_gov[which(df.final$country == "United Kingdom" & df.final$year == "2014")] <- 
  df.final$left_gov[which(df.final$country == "United Kingdom" & df.final$year == "2010")]

### update left government with CMP data
# read the Manifesto Project file, keeping only the five columns used below
# and giving them the short names the rest of this section refers to
cmp <- setNames(
  read.csv("./CMP/MPDataset_MPDS2018b.csv", stringsAsFactors = FALSE)[
    , c("countryname", "edate", "partyname", "absseat", "rile")],
  c("country", "year", "party", "seats", "ideology")
)
# put ideology on the same 0-10 scale as CHES (input treated as -100..100)
cmp$ideology <- rescalr(cmp$ideology, theoryMin = -100, theoryMax = 100,
                        newMin = 0, newMax = 10)
# keep only what follows the second "/" of the election date (the year)
cmp$year <- gsub("(.*/.*/)(.*)", "\\2", cmp$year)
# align one country name with the spelling used in df.final
cmp$country[cmp$country %in% "Slovakia"] <- "Slovak Republic"
# Fill df.final$left_gov from the CMP ideology scores prepared above.
#
# Each entry names the df.final country, the df.final year(s) to fill, the CMP
# election year to read from, and the governing party or parties. A single
# party is copied directly; several parties are combined into a seat-weighted
# mean of their ideology scores.

# Seat-weighted mean ideology of `parties` in one CMP country-election.
#   cmp      data frame with columns country, year, party, seats, ideology
#   country  CMP country name
#   election CMP election year, as a character string
#   parties  character vector of CMP party names
# Returns a single number. When none of the parties is found the result is
# NA (with a warning): the bare formula would otherwise evaluate to
# sum(numeric(0)) == 0 and silently code the government as ideology 0. A
# warning is also raised when only some of the parties are found; the mean is
# then taken over the parties that were matched.
cmp_coalition_ideology <- function(cmp, country, election, parties) {
  members <- cmp[which(cmp$country == country & cmp$year == election &
                         cmp$party %in% parties), ]
  if (nrow(members) == 0) {
    warning("no CMP rows matched for ", country, " ", election,
            "; left_gov set to NA", call. = FALSE)
    return(NA_real_)
  }
  unmatched <- setdiff(parties, members$party)
  if (length(unmatched) > 0) {
    warning("CMP parties not found for ", country, " ", election, ": ",
            paste(unmatched, collapse = "; "), call. = FALSE)
  }
  sum(members$ideology * members$seats / sum(members$seats))
}

# Build one update entry; years are stored as strings because they are
# compared against string-valued year columns below.
cmp_entry <- function(country, years, election, parties) {
  list(country = country, years = as.character(years),
       election = as.character(election), parties = parties)
}

# coalitions that appear in more than one entry
cmp_lux_csv_lsap <- c("Christian Social People’s Party",
                      "Socialist Workers’ Party of Luxembourg")
cmp_jpn_ldp_ncgp <- c("Liberal Democratic Party", "New Clean Government Party")
cmp_nor_coalition <- c("Norwegian Labour Party", "Socialist Left Party",
                       "Centre Party")

cmp_left_gov_specs <- list(
  # Australia 2007
  cmp_entry("Australia", 2007, 2007, "Australian Labor Party"),
  # Austria 1996
  cmp_entry("Austria", 1996, 1995,
            c("Austrian Social Democratic Party", "Austrian People’s Party")),
  # Belgium 1996
  cmp_entry("Belgium", 1996, 1995,
            c("Christian People’s Party", "Francophone Socialist Party",
              "Flemish Socialist Party", "Christian Social Party")),
  # Croatia 2014, 2015 (2015 election wasn't until November, so carry forward)
  cmp_entry("Croatia", 2014, 2011, "Kukuriku Coalition"),
  cmp_entry("Croatia", 2015, 2011, "Kukuriku Coalition"),
  # Cyprus 2004, 2005
  cmp_entry("Cyprus", 2004, 2001, "Progressive Party of the Working People"),
  cmp_entry("Cyprus", 2005, 2001, "Progressive Party of the Working People"),
  # Cyprus 2006-2010
  cmp_entry("Cyprus", 2006:2010, 2006,
            c("Progressive Party of the Working People",
              "United Democratic Union of Cyprus", "Democratic Party")),
  # Denmark 1996
  cmp_entry("Denmark", 1996, 1994,
            c("Social Democratic Party", "Danish Social-Liberal Party",
              "Centre Democrats")),
  # Estonia 2015
  cmp_entry("Estonia", 2015, 2015,
            c("Pro Patria and Res Publica Union", "Estonian Reform Party",
              "Social Democratic Party")),
  # Finland 1996
  cmp_entry("Finland", 1996, 1995,
            c("Finnish Social Democrats", "National Coalition",
              "Left Wing Alliance", "Swedish People’s Party",
              "Green Union")),
  # France 1967
  cmp_entry("France", 1967, 1967,
            "Democratic Union of the Fifth Republic - Gaulists"),
  # France 1996
  cmp_entry("France", 1996, 1993,
            c("Union for French Democracy", "Rally for the Republic")),
  # Germany 1996
  cmp_entry("Germany", 1996, 1994,
            c("Christian Democratic Union/Christian Social Union",
              "Free Democratic Party")),
  # Iceland 2009-2011
  cmp_entry("Iceland", 2009:2011, 2009,
            c("The Alliance", "Left Green Movement")),
  # Iceland 2013
  cmp_entry("Iceland", 2013, 2013,
            c("Independence Party", "Progressive Party")),
  # Ireland 1996
  cmp_entry("Ireland", 1996, 1992, c("Soldiers of Destiny", "Labour Party")),
  # Japan 2003
  cmp_entry("Japan", 2003, 2003,
            c("Liberal Democratic Party", "New Conservative Party",
              "New Clean Government Party")),
  # Japan 2005-2008
  cmp_entry("Japan", 2005:2008, 2005, cmp_jpn_ldp_ncgp),
  # Japan 2013
  cmp_entry("Japan", 2013, 2012, cmp_jpn_ldp_ncgp),
  # Luxembourg 1996
  cmp_entry("Luxembourg", 1996, 1994, cmp_lux_csv_lsap),
  # Luxembourg 1999-2003
  cmp_entry("Luxembourg", 1999:2003, 1999,
            c("Christian Social People’s Party", "Democratic Party")),
  # Luxembourg 2004-2008
  cmp_entry("Luxembourg", 2004:2008, 2004, cmp_lux_csv_lsap),
  # Luxembourg 2009
  cmp_entry("Luxembourg", 2009, 2009, cmp_lux_csv_lsap),
  # Netherlands 1996
  cmp_entry("Netherlands", 1996, 1994,
            c("Labour Party", "People’s Party for Freedom and Democracy",
              "Democrats‘66")),
  # Norway 2005, 2007
  cmp_entry("Norway", c(2005, 2007), 2005, cmp_nor_coalition),
  # Norway 2009
  cmp_entry("Norway", 2009, 2009, cmp_nor_coalition),
  # Portugal 1996
  cmp_entry("Portugal", 1996, 1995, "Socialist Party"),
  # Sweden 1985
  cmp_entry("Sweden", 1985, 1985, "Social Democratic Labour Party"),
  # Sweden 1988
  cmp_entry("Sweden", 1988, 1988, "Social Democratic Labour Party"),
  # Sweden 1991
  cmp_entry("Sweden", 1991, 1991,
            c("Moderate Coalition Party", "Liberal People’s Party",
              "Centre Party", "Christian Democratic Community Party")),
  # Sweden 1994-1997
  cmp_entry("Sweden", 1994:1997, 1994,
            c("Social Democratic Labour Party", "Left Party",
              "Green Ecology Party")),
  # Switzerland 1975
  cmp_entry("Switzerland", 1975, 1975,
            "Social Democratic Party of Switzerland"),
  # Switzerland 1979
  cmp_entry("Switzerland", 1979, 1979,
            "Social Democratic Party of Switzerland"),
  # Switzerland 2007, 2010, 2011
  cmp_entry("Switzerland", c(2007, 2010, 2011), 2007, "Swiss People’s Party"),
  # Ukraine 1998
  cmp_entry("Ukraine", 1998, 1998, "Communist Party of Ukraine"),
  # UK 1987-1991
  cmp_entry("United Kingdom", 1987:1991, 1987, "Conservative Party"),
  # UK 1992-1996
  cmp_entry("United Kingdom", 1992:1996, 1992, "Conservative Party"),
  # UK 2015
  cmp_entry("United Kingdom", 2015, 2015, "Conservative Party")
)

for (cmp_spec in cmp_left_gov_specs) {
  cmp_target <- which(df.final$country == cmp_spec$country &
                        df.final$year %in% cmp_spec$years)
  if (length(cmp_spec$parties) == 1) {
    # single governing party: copy its CMP score as-is (an unmatched party
    # makes the assignment below fail rather than write a made-up value)
    cmp_value <- cmp$ideology[which(cmp$country == cmp_spec$country &
                                      cmp$year == cmp_spec$election &
                                      cmp$party == cmp_spec$parties)]
  } else {
    # coalition: seat-weighted mean over the member parties
    cmp_value <- cmp_coalition_ideology(cmp, cmp_spec$country,
                                        cmp_spec$election, cmp_spec$parties)
  }
  df.final$left_gov[cmp_target] <- cmp_value
}
rm(cmp_coalition_ideology, cmp_entry, cmp_lux_csv_lsap, cmp_jpn_ldp_ncgp,
   cmp_nor_coalition, cmp_left_gov_specs, cmp_spec, cmp_target, cmp_value)

### update left_gov using data from Baker and Greene
# read the workbook, keep columns 1-5 and 8, and convert to a plain data frame
bg <- as.data.frame(
  read_xlsx("./Baker-Greene/Website Data 2.0.xlsx")[, c(1:5, 8)]
)
names(bg) <- c("country", "year", "acronym", "party", "coalition", "ideology")
# rescale ideology to 0-10 (input treated as running from 0 to 20)
bg$ideology <- rescalr(bg$ideology, theoryMin = 0, theoryMax = 20,
                       newMin = 0, newMax = 10)
# Fill df.final$left_gov from the Baker-Greene ideology scores.
#
# Each entry gives: the df.final country name, the Baker-Greene country name,
# the df.final year(s) to fill, the Baker-Greene year to read from, and the
# label identifying the party. The label is looked up in the "acronym" column
# unless `field` says otherwise.
bg_entry <- function(country, bg_country, years, election, label,
                     field = "acronym") {
  list(country = country, bg_country = bg_country,
       years = as.character(years), election = as.character(election),
       label = label, field = field)
}

bg_left_gov_specs <- list(
  # Argentina 1995; 1999; 2006; 2008; 2013, 2014
  bg_entry("Argentina", "ARGENTINA", 1995, 1995, "PJ"),
  bg_entry("Argentina", "ARGENTINA", 1999, 1999, "ATJE"),
  bg_entry("Argentina", "ARGENTINA", 2006, 2003, "FplV"),
  bg_entry("Argentina", "ARGENTINA", 2008, 2007, "FplV"),
  bg_entry("Argentina", "ARGENTINA", c(2013, 2014), 2011, "FplV"),
  # Bolivia 2004; 2006, 2008
  bg_entry("Bolivia", "BOLIVIA", 2004, 2002, "MNR/MBL"),
  bg_entry("Bolivia", "BOLIVIA", c(2006, 2008), 2005, "MAS"),
  # Brazil 2002; 2006-2008; 2010, 2012; 2014
  bg_entry("Brazil", "BRAZIL", 2002, 2002, "PT"),
  bg_entry("Brazil", "BRAZIL", c(2006, 2007, 2008), 2006, "PT"),
  bg_entry("Brazil", "BRAZIL", c(2010, 2012), 2010, "PT"),
  bg_entry("Brazil", "BRAZIL", 2014, 2014, "PT"),
  # Chile 1996; 1999-2000; 2005-2006, 2008; 2009, 2011
  bg_entry("Chile", "CHILE", 1996, 1993, "CPD"),
  bg_entry("Chile", "CHILE", c(1999, 2000), 1999, "PPD"),
  bg_entry("Chile", "CHILE", c(2005, 2006, 2008), 2005, "PS"),
  bg_entry("Chile", "CHILE", c(2009, 2011), 2009, "RN"),
  # Colombia 1998; 2004-2005; 2006, 2008; 2010, 2012; 2014
  bg_entry("Colombia", "COLOMBIA", 1998, 1998, "PSC"),
  bg_entry("Colombia", "COLOMBIA", c(2004, 2005), 2002, "PC"),
  bg_entry("Colombia", "COLOMBIA", c(2006, 2008), 2006, "PC"),
  bg_entry("Colombia", "COLOMBIA", c(2010, 2012), 2010, "PSUN"),
  bg_entry("Colombia", "COLOMBIA", 2014, 2014, "PSUN"),
  # Costa Rica 2004; 2006, 2008; 2010, 2012; 2014
  bg_entry("Costa Rica", "COSTA RICA", 2004, 2002, "USC"),
  bg_entry("Costa Rica", "COSTA RICA", c(2006, 2008), 2006, "PLN"),
  bg_entry("Costa Rica", "COSTA RICA", c(2010, 2012), 2010, "LN"),
  bg_entry("Costa Rica", "COSTA RICA", 2014, 2014, "PAC"),
  # Dominican Republic 1996; 2006; 2008, 2010; 2012, 2014
  bg_entry("Dominican Republic", "DOMINICAN REPUBLIC", 1996, 1996, "PLD"),
  bg_entry("Dominican Republic", "DOMINICAN REPUBLIC", 2006, 2004, "PLD"),
  bg_entry("Dominican Republic", "DOMINICAN REPUBLIC", c(2008, 2010), 2008,
           "PLD"),
  bg_entry("Dominican Republic", "DOMINICAN REPUBLIC", c(2012, 2014), 2012,
           "PLD"),
  # Ecuador 2004; 2006, 2008; 2010, 2012; 2013
  bg_entry("Ecuador", "ECUADOR", 2004, 2002, "PSP"),
  bg_entry("Ecuador", "ECUADOR", c(2006, 2008), 2006, "Alianza PAiS"),
  bg_entry("Ecuador", "ECUADOR", c(2010, 2012), 2009, "Alianza PAiS"),
  bg_entry("Ecuador", "ECUADOR", 2013, 2013, "Alianza PAiS"),
  # El Salvador 1999; 2004, 2006, 2008; 2010, 2012; 2014
  bg_entry("El Salvador", "EL SALVADOR", 1999, 1999, "ARENA"),
  bg_entry("El Salvador", "EL SALVADOR", c(2004, 2006, 2008), 2004, "ARENA"),
  bg_entry("El Salvador", "EL SALVADOR", c(2010, 2012), 2009, "FMLN"),
  bg_entry("El Salvador", "EL SALVADOR", 2014, 2014, "FMLN"),
  # Guatemala 2004, 2006; 2008, 2010; 2012, 2014
  bg_entry("Guatemala", "GUATEMALA", c(2004, 2006), 2003, "PP-MR-PSN"),
  bg_entry("Guatemala", "GUATEMALA", c(2008, 2010), 2007, "UNE"),
  bg_entry("Guatemala", "GUATEMALA", c(2012, 2014), 2011, "PP"),
  # Honduras 2004; 2006, 2008; 2010, 2012; 2014
  bg_entry("Honduras", "HONDURAS", 2004, 2001, "PNH"),
  bg_entry("Honduras", "HONDURAS", c(2006, 2008), 2005, "PLH"),
  bg_entry("Honduras", "HONDURAS", c(2010, 2012), 2009, "PNH"),
  bg_entry("Honduras", "HONDURAS", 2014, 2013, "PNH"),
  # Mexico 1995-1997; 2000, 2003-2005; 2006-2010
  bg_entry("Mexico", "MEXICO", c(1995, 1996, 1997), 1994, "PRI"),
  bg_entry("Mexico", "MEXICO", c(2000, 2003, 2004, 2005), 2000, "PAN"),
  bg_entry("Mexico", "MEXICO", c(2006, 2007, 2008, 2009, 2010), 2006, "PAN"),
  # Nicaragua 2004; 2006, 2008, 2010; 2012, 2014
  bg_entry("Nicaragua", "NICARAGUA", 2004, 2001, "PLC"),
  bg_entry("Nicaragua", "NICARAGUA", c(2006, 2008, 2010), 2006, "FSLN"),
  bg_entry("Nicaragua", "NICARAGUA", c(2012, 2014), 2011, "FSLN"),
  # Panama 2004, 2006, 2008; 2010, 2012; 2014 (matched on full party name)
  bg_entry("Panama", "PANAMA", c(2004, 2006, 2008), 2004, "PRD"),
  bg_entry("Panama", "PANAMA", c(2010, 2012), 2009, "CD"),
  bg_entry("Panama", "PANAMA", 2014, 2014, "Partido Panamenista",
           field = "party"),
  # Paraguay 2008, 2010, 2012; 2014
  bg_entry("Paraguay", "PARAGUAY", c(2008, 2010, 2012), 2008, "APC"),
  bg_entry("Paraguay", "PARAGUAY", 2014, 2013, "ANR-PC"),
  # Peru 1996, 2000; 2001 (matched on full party name); 2006, 2008, 2010;
  # 2011, 2012, 2014
  bg_entry("Peru", "PERU", c(1996, 2000), 1995, "C-90-NM"),
  bg_entry("Peru", "PERU", 2001, 2001, "Perú Posible", field = "party"),
  bg_entry("Peru", "PERU", c(2006, 2008, 2010), 2006, "PAP"),
  bg_entry("Peru", "PERU", c(2011, 2012, 2014), 2011, "GP"),
  # Uruguay 1996; 2006-2008; 2009-2012; 2014
  bg_entry("Uruguay", "URUGUAY", 1996, 1994, "PC"),
  bg_entry("Uruguay", "URUGUAY", c(2006, 2007, 2008), 2004, "EP"),
  bg_entry("Uruguay", "URUGUAY", c(2009, 2010, 2011, 2012), 2009, "FA"),
  bg_entry("Uruguay", "URUGUAY", 2014, 2014, "FA"),
  # Venezuela 1996; 2000
  bg_entry("Venezuela, RB", "VENEZUELA", 1996, 1993, "CN"),
  bg_entry("Venezuela, RB", "VENEZUELA", 2000, 2000, "MVR")
)

for (bg_spec in bg_left_gov_specs) {
  bg_target <- which(df.final$country == bg_spec$country &
                       df.final$year %in% bg_spec$years)
  bg_source <- which(bg$country == bg_spec$bg_country &
                       bg$year == bg_spec$election &
                       bg[[bg_spec$field]] == bg_spec$label)
  df.final$left_gov[bg_target] <- bg$ideology[bg_source]
}
rm(bg_entry, bg_left_gov_specs, bg_spec, bg_target, bg_source)

# checkpoint: write the panel to disk, then drop every workspace object except
# the utility functions and the data objects named in the keep-list
write.csv(df.final, file = "./final/final-many-to-many.csv", row.names = FALSE)
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df",
                          "qcut", "rescalr", "simpleCap", "mass", "elite",
                          "elite.full", "elite.nomeps", "df.final")))

##### STRATIFY ELITE SAMPLES FOR MAIN ANALYSIS #####
# Argentina 1995, 1999, 2006
# Each year has its own table of old label -> new label; the recodes are
# applied in table order to a local copy of the party column.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  recodes <- list(
    "1995" = c("Otros partidos" = "Other",
               "Partidos provinciales" = "Other"),
    "1999" = c("FREPASO" = "Alliance",
               "UCR" = "Alliance",
               "Provinciales" = "Other",
               "Otros" = "Other"),
    "2006" = c("Otros" = "Other",
               "PS" = "Other",
               "PJ" = "FPV PJ")
  )
  for (yr in names(recodes)) {
    in_year <- elite.nomeps$country == "Argentina" & elite.nomeps$year == yr
    for (old in names(recodes[[yr]])) {
      party[which(in_year & party == old)] <- recodes[[yr]][[old]]
    }
  }
  party
})
# Argentina 2008
# The 2008 recodes run with the session switched to the es_ES.UTF-8 locale
# (presumably because some labels contain accented characters -- TODO confirm).
# The locale that was active beforehand is captured with Sys.getlocale() and
# restored on exit, even if a step fails, rather than forcing a hard-coded
# English locale chosen by operating system.
elite.nomeps$party <- (function(party) {
  previous_locale <- Sys.getlocale()
  on.exit(Sys.setlocale("LC_ALL", previous_locale), add = TRUE)
  Sys.setlocale("LC_ALL", "es_ES.UTF-8")

  arg08 <- elite.nomeps$country == "Argentina" & elite.nomeps$year == "2008"

  # Substring recodes. Order matters: every step sees the labels produced by
  # the steps before it (labels containing "Victoria" are already "FPV PJ"
  # when the "Frente" step runs).
  party[which(arg08 & grepl("ARI", party))] <- "ARI"
  party[which(arg08 & grepl("Concert", party))] <- "Other"
  party[which(arg08 & grepl("Victoria", party))] <- "FPV PJ"
  party[which(arg08 & grepl("Frente", party))] <- "Other"

  # Exact-label recodes
  party[which(arg08 & party == "Otros partidos")] <- "Other"
  party[which(arg08 & party == "Partido Socialista")] <- "ARI"

  party
})(elite.nomeps$party)
# Argentina 2010
# Exact-label recodes, with one substring match ("por Santiago") that is run
# under the es_ES.UTF-8 locale. The locale active before that match is saved
# and restored afterwards (also if grepl() fails), rather than forcing a
# hard-coded English locale chosen by operating system.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  arg10 <- elite.nomeps$country == "Argentina" & elite.nomeps$year == "2010"

  # grepl() evaluated under the Spanish UTF-8 locale; the caller's locale is
  # put back when this helper returns
  grepl_es <- function(pattern, x) {
    previous_locale <- Sys.getlocale()
    on.exit(Sys.setlocale("LC_ALL", previous_locale), add = TRUE)
    Sys.setlocale("LC_ALL", "es_ES.UTF-8")
    grepl(pattern, x)
  }

  # recodes applied before the substring match
  for (old in c("CC", "ARI", "UCR")) {
    party[which(arg10 & party == old)] <- "CC ARI"
  }

  # substring match on the label containing "por Santiago"
  party[which(arg10 & grepl_es("por Santiago", party))] <- "Other"

  # recodes applied after the substring match, in their original order
  later <- c("Frente para la Victoria-PJ" = "FPV PJ",
             "GEN" = "Other",
             "Movimiento Proyecto Sur" = "PRO",
             "Nuevo Encuentro Popular y Solidario" = "FPV PJ",
             "Partido Socialista" = "CC ARI",
             "Peronismo Federal" = "PRO",
             "Peronista" = "FPV PJ")
  for (old in names(later)) {
    party[which(arg10 & party == old)] <- later[[old]]
  }

  party
})
# Argentina 2012
# Old label -> new label table, applied in order to a local copy of the column.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  arg12 <- elite.nomeps$country == "Argentina" & elite.nomeps$year == "2012"
  recodes <- c("CC" = "CC ARI",
               "Frente para la Victoria-PJ" = "FPV PJ",
               "GEN" = "Other",
               "Nuevo Encuentro Popular y Solidario" = "Other",
               "Partido Socialista" = "Other",
               "Peronismo Federal" = "PF",
               "Peronista" = "PF")
  for (old in names(recodes)) {
    party[which(arg12 & party == old)] <- recodes[[old]]
  }
  party
})
# Australia 2007: relabel "Australian Greens" as "Other"
elite.nomeps$party <- replace(
  elite.nomeps$party,
  which(elite.nomeps$country == "Australia" &
          elite.nomeps$year == "2007" &
          elite.nomeps$party == "Australian Greens"),
  "Other"
)
# Belgium 2007, 2008, 2009
# The same five recodes are applied to each of the three years.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  recodes <- c("ECOLO" = "ECOLO Green",
               "FN" = "FN LDD",
               "spa" = "SPA SPIRIT",
               "spirit" = "SPA SPIRIT",
               "VlaamsBelang" = "VB")
  for (yr in c("2007", "2008", "2009")) {
    in_year <- elite.nomeps$country == "Belgium" & elite.nomeps$year == yr
    for (old in names(recodes)) {
      party[which(in_year & party == old)] <- recodes[[old]]
    }
  }
  party
})
# Belgium 2010, 2011, 2012, 2013, 2014
# The same ten recodes are applied to each of the five years.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  recodes <- c("ECOLO" = "ECOLO Green",
               "GROEN" = "ECOLO Green",
               "FN" = "FN LDD",
               "Lijst Dedecker" = "FN LDD",
               "spa" = "SPA SPIRIT",
               "SPA" = "SPA SPIRIT",
               "spirit" = "SPA SPIRIT",
               "VlaamsBelang" = "VB",
               "VlaamsBelang VB" = "VB",
               "VLD" = "Open VLD")
  for (yr in c("2010", "2011", "2012", "2013", "2014")) {
    in_year <- elite.nomeps$country == "Belgium" & elite.nomeps$year == yr
    for (old in names(recodes)) {
      party[which(in_year & party == old)] <- recodes[[old]]
    }
  }
  party
})
# Bolivia 2004, 2010, 2012, 2014: relabel "Otros partidos" as "Other"
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  for (yr in c("2004", "2010", "2012", "2014")) {
    party[which(elite.nomeps$country == "Bolivia" &
                  elite.nomeps$year == yr &
                  party == "Otros partidos")] <- "Other"
  }
  party
})
# Brazil 2006, 2007, 2008, 2010, 2012, 2014
# 2006-2010 share one recode table; 2012 and 2014 have shorter ones.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  through_2010 <- c("PR" = "PL",
                    "PSC" = "Other",
                    "PTC" = "Other",
                    "PRB" = "Other",
                    "PMN" = "Other",
                    "PHS" = "Other")
  recodes <- list(
    "2006" = through_2010,
    "2007" = through_2010,
    "2008" = through_2010,
    "2010" = through_2010,
    "2012" = c("PMN" = "Other", "PSOL" = "Other"),
    "2014" = c("PMN" = "Other")
  )
  for (yr in names(recodes)) {
    in_year <- elite.nomeps$country == "Brazil" & elite.nomeps$year == yr
    for (old in names(recodes[[yr]])) {
      party[which(in_year & party == old)] <- recodes[[yr]][[old]]
    }
  }
  party
})
# Chile 1996, 1999, 2000, 2005, 2006, 2008, 2009, 2010, 2011, 2012, 2014
# Every listed label is relabelled "Other"; the set of labels varies by year.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  residual <- "Otros partidos"
  residual_and_indep <- c("Otros partidos", "Independiente")
  to_other <- list(
    "1996" = residual,
    "1999" = residual,
    "2000" = residual,
    "2005" = "PRSD",
    "2006" = residual,
    "2008" = residual,
    "2009" = residual,
    "2010" = residual_and_indep,
    "2011" = residual_and_indep,
    "2012" = residual_and_indep,
    "2014" = residual_and_indep
  )
  for (yr in names(to_other)) {
    in_year <- elite.nomeps$country == "Chile" & elite.nomeps$year == yr
    for (old in to_other[[yr]]) {
      party[which(in_year & party == old)] <- "Other"
    }
  }
  party
})
# Colombia 1998, 2004, 2005, 2006, 2008, 2014
# Exact-label recodes are table-driven; 2014 uses substring matches instead.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  liberal <- c("PL-O" = "PL", "PL-U" = "PL")
  # both spellings of the label, with and without a trailing space
  alas <- c("Mov. Equipo Alas Colombia " = "MEAC",
            "Mov. Equipo Alas Colombia" = "MEAC")
  exact <- list(
    "1998" = c("Otros" = "Other"),
    "2004" = liberal,
    "2005" = liberal,
    "2006" = alas,
    "2008" = alas
  )
  for (yr in names(exact)) {
    in_year <- elite.nomeps$country == "Colombia" & elite.nomeps$year == yr
    for (old in names(exact[[yr]])) {
      party[which(in_year & party == old)] <- exact[[yr]][[old]]
    }
  }
  # Colombia 2014: substring matches, applied in this order
  col14 <- elite.nomeps$country == "Colombia" & elite.nomeps$year == "2014"
  party[which(col14 & grepl("Ciudad", party))] <- "OC"
  party[which(col14 & grepl("Alianza", party))] <- "AdlV"
  party
})
# Costa Rica 2004, 2010, 2012, 2014
# Each year has an ordered table of name fragments (matched with grepl) and
# the acronym that replaces any label containing that fragment. The tables are
# walked top to bottom, so a label recoded by an earlier fragment is what the
# later fragments are tested against.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_costa_rica <- elite.nomeps$country == "Costa Rica"

  fragments_by_year <- list(
    "2004" = c(Costarri = "RC", Libert = "ML"),
    "2010" = c(Libert = "ML"),
    "2012" = c(Libert = "ML"),
    "2014" = c(Accesibilidad = "PASE",
               Movimiento    = "ML",
               Unidad        = "PUSC",
               Costarri      = "RC",
               Restaurac     = "RN",
               Liberac       = "PLN",
               Alianza       = "ADC",
               Frente        = "FA")
  )

  for (yr in names(fragments_by_year)) {
    in_wave <- in_costa_rica & elite.nomeps$year == yr
    fragments <- fragments_by_year[[yr]]
    for (fragment in names(fragments)) {
      hits <- which(in_wave & grepl(fragment, party))
      party[hits] <- fragments[[fragment]]
    }
  }

  party
})
# DR 2010, 2012, 2014
# In each of these years the PRSC label is replaced by "Other".
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_dr <- elite.nomeps$country == "Dominican Republic"

  for (yr in c("2010", "2012", "2014")) {
    prsc_rows <- which(in_dr & elite.nomeps$year == yr & party == "PRSC")
    party[prsc_rows] <- "Other"
  }

  party
})
# Ecuador 2008, 2010, 2012, 2013, 2014
# Labels are matched on a name fragment (grepl) and replaced by a short code.
# 2008-2012 share one rule; 2013 and 2014 share a longer, ordered rule set.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_ecuador <- elite.nomeps$country == "Ecuador"

  early_rules <- c(MPA = "PAIS")
  late_rules <- c(Carchi     = "Independent",
                  Region     = "Independent",
                  izquier    = "PUL",
                  Pachakutik = "PUL",
                  Alianza    = "PAIS")

  rules_by_year <- list("2008" = early_rules,
                        "2010" = early_rules,
                        "2012" = early_rules,
                        "2013" = late_rules,
                        "2014" = late_rules)

  for (yr in names(rules_by_year)) {
    in_wave <- in_ecuador & elite.nomeps$year == yr
    rules <- rules_by_year[[yr]]
    # fragments are applied in table order, each against the current labels
    for (fragment in names(rules)) {
      party[which(in_wave & grepl(fragment, party))] <- rules[[fragment]]
    }
  }

  party
})
# El Salvador 2012, 2014
# PES and CD are both replaced by "Other" in each of these years.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_el_salvador <- elite.nomeps$country == "El Salvador"

  for (yr in c("2012", "2014")) {
    in_wave <- in_el_salvador & elite.nomeps$year == yr
    for (small_party in c("PES", "CD")) {
      party[which(in_wave & party == small_party)] <- "Other"
    }
  }

  party
})
# Finland 2011-2015
# Exact-label recodes, written as ordered old -> new lookup tables per year.
# 2011 maps onto short-form labels; 2012-2015 map onto the long-form labels
# ("The Left Alliance", "Swedish Peoples Party in Finland").
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_finland <- elite.nomeps$country == "Finland"

  recodes_by_year <- list(
    "2011" = c(
      "National Coalition Pary"          = "National Coalition Party",
      "Center Party of Finland"          = "Centre Party",
      "Christian Democrats in Finland"   = "Christian Democrats",
      "Green League"                     = "Greens",
      "Social Democratic Party"          = "Social Democrats",
      "Swedish Peoples Party in Finland" = "Swedish Peoples Party",
      "The Left Alliance"                = "Left Alliance",
      "The Finns Party"                  = "True Finns",
      "Communist Party"                  = "Left Alliance",
      # NOTE(review): Liberals are folded into Left Alliance -- confirm this
      # is the intended target
      "Liberals"                         = "Left Alliance",
      # Aland Coalition caucuses with the SPP
      "Other"                            = "Swedish Peoples Party"
    )
  )
  for (yr in c("2012", "2013", "2014", "2015")) {
    recodes_by_year[[yr]] <- c(
      "Communist Party" = "The Left Alliance",
      "Other"           = "Swedish Peoples Party in Finland"
    )
  }

  for (yr in names(recodes_by_year)) {
    in_wave <- in_finland & elite.nomeps$year == yr
    lookup <- recodes_by_year[[yr]]
    for (old_label in names(lookup)) {
      party[which(in_wave & party == old_label)] <- lookup[[old_label]]
    }
  }

  party
})
# France 1967, 2007-2012
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_france <- elite.nomeps$country == "France"

  # Apply an ordered old -> new lookup to the rows of one survey year and
  # return the updated label vector.
  recode_wave <- function(labels, yr, lookup) {
    in_wave <- in_france & elite.nomeps$year == yr
    for (old_label in names(lookup)) {
      labels[which(in_wave & labels == old_label)] <- lookup[[old_label]]
    }
    labels
  }

  # 1967
  party <- recode_wave(party, "1967",
                       c("MRP"             = "Other",
                         "PDM"             = "Other",
                         "Radical"         = "FGDS",
                         "Socialist Party" = "FGDS",
                         "UNR"             = "Fifth Republic"))

  # 2007-2012 - set MPF to UMP or it gets zero weight; Modem goes to "Other"
  for (yr in c("2007", "2008", "2009", "2010", "2011", "2012")) {
    party <- recode_wave(party, yr, c("Modem" = "Other", "MPF" = "UMP"))
  }

  party
})
# Germany 2005-2009
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_germany <- elite.nomeps$country == "Germany"

  # 2005-2008 -- WASG ran on PDS list
  for (yr in c("2005", "2006", "2007", "2008")) {
    wasg_rows <- which(in_germany & elite.nomeps$year == yr & party == "WASG")
    party[wasg_rows] <- "Linke PDS"
  }

  # 2009 -- WASG and Linke PDS merge into Die Linke
  in_2009 <- in_germany & elite.nomeps$year == "2009"
  for (predecessor in c("WASG", "Linke PDS")) {
    party[which(in_2009 & party == predecessor)] <- "Die Linke"
  }

  party
})
# Guatemala 2004-2014
# Three rule sets, each shared by two survey years: a list of exact labels
# and the single label that replaces them.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_guatemala <- elite.nomeps$country == "Guatemala"

  rule_sets <- list(
    # 2004, 2006: small parties (and "Otros" with a trailing blank) -> "Otros"
    list(years = c("2004", "2006"),
         from  = c("Integracionista", "Patriota", "PSN", "Otros "),
         to    = "Otros"),
    # 2008, 2010: BG -> "Otros partidos"
    list(years = c("2008", "2010"),
         from  = "BG",
         to    = "Otros partidos"),
    # 2012, 2014: UNE and GANA -> "UNE-GANA"
    list(years = c("2012", "2014"),
         from  = c("UNE", "GANA"),
         to    = "UNE-GANA")
  )

  for (rule in rule_sets) {
    for (yr in rule$years) {
      in_wave <- in_guatemala & elite.nomeps$year == yr
      for (old_label in rule$from) {
        party[which(in_wave & party == old_label)] <- rule$to
      }
    }
  }

  party
})
# Honduras 2014
# PINU is replaced by "Other".
elite.nomeps$party <- replace(
  elite.nomeps$party,
  which(elite.nomeps$country == "Honduras" &
          elite.nomeps$year == "2014" &
          elite.nomeps$party == "PINU"),
  "Other"
)
# Hungary 2006-2014
# Fidesz and KDNP are combined into "Fidesz-KDNP" in every year. From 2010 on,
# "No party" and "Other" are first set to NA -- too many "other" and "no" to
# place their weights.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_hungary <- elite.nomeps$country == "Hungary"

  alliance <- c("Fidesz" = "Fidesz-KDNP", "KDNP" = "Fidesz-KDNP")
  drop_unplaced <- c("No party" = NA, "Other" = NA)

  lookup_by_year <- list()
  for (yr in c("2006", "2007", "2008", "2009")) {
    lookup_by_year[[yr]] <- alliance
  }
  for (yr in c("2010", "2011", "2012", "2013", "2014")) {
    # NA recodes come first, as in the year-by-year statements they replace
    lookup_by_year[[yr]] <- c(drop_unplaced, alliance)
  }

  for (yr in names(lookup_by_year)) {
    in_wave <- in_hungary & elite.nomeps$year == yr
    lookup <- lookup_by_year[[yr]]
    for (old_label in names(lookup)) {
      party[which(in_wave & party == old_label)] <- lookup[[old_label]]
    }
  }

  party
})
# Ireland 2007-2012
# The same three recodes apply to every Irish survey year:
#   labels containing "FIANNA" -> "FF"
#   labels containing "SINN"   -> "SF"
#   "INDEPENDENT"              -> "Other"
#
# The two grepl() matches are run with the session in the "C" locale
# (presumably because the raw labels contain bytes that are not valid in a
# multibyte locale -- confirm against the source data). The locale that was
# active on entry is captured first and restored afterwards, even if a match
# raises an error. Previously the code switched to a hard-coded, OS-specific
# locale instead, which overwrote the user's settings and left the session in
# "C" whenever that locale was not installed.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_ireland <- elite.nomeps$country == "Ireland"
  waves <- c("2007", "2008", "2009", "2010", "2011", "2012")

  # Sys.setlocale("LC_ALL", ...) changes exactly these four categories, so
  # they are the ones to snapshot. They are queried one by one because the
  # combined "LC_ALL" string cannot portably be fed back to Sys.setlocale().
  categories <- c("LC_COLLATE", "LC_CTYPE", "LC_MONETARY", "LC_TIME")
  saved_locale <- lapply(categories, Sys.getlocale)
  names(saved_locale) <- categories

  Sys.setlocale("LC_ALL", "C")
  tryCatch({
    for (yr in waves) {
      in_wave <- in_ireland & elite.nomeps$year == yr
      party[which(in_wave & grepl("FIANNA", party))] <- "FF"
      party[which(in_wave & grepl("SINN", party))] <- "SF"
    }
  }, finally = {
    # put system back in native locale
    for (category in categories) {
      previous <- saved_locale[[category]]
      # skip categories whose previous value was unavailable or invalid
      if (length(previous) == 1L && nzchar(previous)) {
        Sys.setlocale(category, previous)
      }
    }
  })

  # Exact-match recode, done after the locale has been restored
  for (yr in waves) {
    in_wave <- in_ireland & elite.nomeps$year == yr
    party[which(in_wave & party == "INDEPENDENT")] <- "Other"
  }

  party
})
# Italy 2008-2012
# The same four fragment rules are applied, in this order, to every year.
# A label containing the fragment (grepl) is replaced by the listed name.
elite.nomeps$party <- local({
  party <- elite.nomeps$party
  in_italy <- elite.nomeps$country == "Italy"

  fragment_rules <- c(
    "Futuro"       = "PdL",
    "Popolo"       = "PdL",
    "REPUBBLICANI" = "Unione Democratici Cristiani",
    "Alleanza"     = "Partito Democratico"
  )

  for (yr in c("2008", "2009", "2010", "2011", "2012")) {
    in_wave <- in_italy & elite.nomeps$year == yr
    for (fragment in names(fragment_rules)) {
      matched <- which(in_wave & grepl(fragment, party))
      party[matched] <- fragment_rules[[fragment]]
    }
  }

  party
})
# Japan 2013 -- independents and the small parties listed below are pooled
# into a single "Other" category (exact label matches only)
elite.nomeps$party[which(
  elite.nomeps$country == "Japan" &
    elite.nomeps$year == "2013" &
    elite.nomeps$party %in% c(
      "Independent",
      "New Renaissance Party",
      "People's Life Party",
      "New Party Daichi",
      "People's New Party"
    )
)] <- "Other"
# Netherlands 2006-2009 -- Groen Links and SGP are pooled into "Other"
elite.nomeps$party[which(
  elite.nomeps$country == "Netherlands" &
    elite.nomeps$year %in% c("2006", "2007", "2008", "2009") &
    elite.nomeps$party %in% c("Groen Links", "SGP")
)] <- "Other"
# Netherlands 2010-2011 -- CU and PvdD are pooled into "Other"
elite.nomeps$party[which(
  elite.nomeps$country == "Netherlands" &
    elite.nomeps$year %in% c("2010", "2011") &
    elite.nomeps$party %in% c("CU", "PvdD")
)] <- "Other"
# Nicaragua 2008 and 2010 -- there is no information for independents, so
# their party label is set to missing
elite.nomeps$party[which(
  elite.nomeps$country == "Nicaragua" &
    elite.nomeps$year %in% c("2008", "2010") &
    elite.nomeps$party == "Independiente"
)] <- NA
# Norway 2005, 2007 and 2009 -- any label containing "yre" is normalised to
# the ASCII spelling "Hoyre"
elite.nomeps$party[which(
  elite.nomeps$country == "Norway" &
    elite.nomeps$year %in% c("2005", "2007", "2009") &
    grepl("yre", elite.nomeps$party)
)] <- "Hoyre"
# Panama 2014 -- substring-based recodes, applied in this order
# labels containing "CAMBIO" -> "CD"
elite.nomeps$party[which(
  elite.nomeps$country == "Panama" &
    elite.nomeps$year == "2014" &
    grepl("CAMBIO", elite.nomeps$party)
)] <- "CD"
# labels containing "PANAM" -> "Pista"
elite.nomeps$party[which(
  elite.nomeps$country == "Panama" &
    elite.nomeps$year == "2014" &
    grepl("PANAM", elite.nomeps$party)
)] <- "Pista"
# labels containing "INDE" -> "Other"
elite.nomeps$party[which(
  elite.nomeps$country == "Panama" &
    elite.nomeps$year == "2014" &
    grepl("INDE", elite.nomeps$party)
)] <- "Other"
# Paraguay 2014 -- two substring recodes followed by exact-label recodes
# labels containing "AVANZA" -> "Other"
elite.nomeps$party[which(
  elite.nomeps$country == "Paraguay" &
    elite.nomeps$year == "2014" &
    grepl("AVANZA", elite.nomeps$party)
)] <- "Other"
# labels containing "FRENTE" -> "FG"
elite.nomeps$party[which(
  elite.nomeps$country == "Paraguay" &
    elite.nomeps$year == "2014" &
    grepl("FRENTE", elite.nomeps$party)
)] <- "FG"
# PEN and PPQ -> "Other"
elite.nomeps$party[which(
  elite.nomeps$country == "Paraguay" &
    elite.nomeps$year == "2014" &
    elite.nomeps$party %in% c("PEN", "PPQ")
)] <- "Other"
# Peru 2008 and 2010 -- the "PNP" label could not be identified and does not
# fit the PELA documentation, so it is set to missing
elite.nomeps$party[which(
  elite.nomeps$country == "Peru" &
    elite.nomeps$year %in% c("2008", "2010") &
    elite.nomeps$party == "PNP"
)] <- NA
# Peru 2001 and 2011-2016 -- labels containing "Posible" -> "PP"
elite.nomeps$party[which(
  elite.nomeps$country == "Peru" &
    elite.nomeps$year %in%
      c("2001", "2011", "2012", "2013", "2014", "2015", "2016") &
    grepl("Posible", elite.nomeps$party)
)] <- "PP"
# Peru 2011-2016 -- labels containing "Gana" -> "Gana"; this runs after the
# "Posible" rule, so a label matching both ends up as "PP"
elite.nomeps$party[which(
  elite.nomeps$country == "Peru" &
    elite.nomeps$year %in%
      c("2011", "2012", "2013", "2014", "2015", "2016") &
    grepl("Gana", elite.nomeps$party)
)] <- "Gana"
# Poland 2007-2011 -- the labels listed below (including "INDEPENDENT") are
# pooled into "Other"; the same set is used for every one of these years
elite.nomeps$party[which(
  elite.nomeps$country == "Poland" &
    elite.nomeps$year %in% c("2007", "2008", "2009", "2010", "2011") &
    elite.nomeps$party %in% c(
      "Piast", "SD", "SDPL", "SLD", "UP", "MN", "INDEPENDENT"
    )
)] <- "Other"
# Portugal 2009-2010 -- respell "CDS/PP" with a hyphen instead of a slash
elite.nomeps$party[which(
  elite.nomeps$country == "Portugal" &
    elite.nomeps$year %in% c("2009", "2010") &
    elite.nomeps$party == "CDS/PP"
)] <- "CDS-PP"
# Romania 2012-2015 -- individual parties are replaced by the coalition or
# alliance label they are grouped under
# FC and PDL -> "ARD"
elite.nomeps$party[which(
  elite.nomeps$country == "Romania" &
    elite.nomeps$year %in% c("2012", "2013", "2014", "2015") &
    elite.nomeps$party %in% c("FC", "PDL")
)] <- "ARD"
# PC, PNL, PSD and UNPR -> "USL"
elite.nomeps$party[which(
  elite.nomeps$country == "Romania" &
    elite.nomeps$year %in% c("2012", "2013", "2014", "2015") &
    elite.nomeps$party %in% c("PC", "PNL", "PSD", "UNPR")
)] <- "USL"
# Spain 2008-2012 -- regional and minor parties are pooled into "Other".
#
# Exact-label recodes shared by all five years.
elite.nomeps$party[which(
  elite.nomeps$country == "Spain" &
    elite.nomeps$year %in% c("2008", "2009", "2010", "2011", "2012") &
    elite.nomeps$party %in% c("BNG", "CIU", "IU", "UPN")
)] <- "Other"
# PNV is recoded for 2008-2011 only; 2012 has no PNV rule and that is kept.
# NOTE(review): confirm that leaving PNV untouched in 2012 is intentional.
elite.nomeps$party[which(
  elite.nomeps$country == "Spain" &
    elite.nomeps$year %in% c("2008", "2009", "2010", "2011") &
    elite.nomeps$party == "PNV"
)] <- "Other"
# Substring recodes: labels containing "Canaria" or "Verds".
# This step used to switch the whole session to the "C" locale before calling
# grepl() and then set a hard-coded, OS-specific locale afterwards
# (presumably because some labels hold bytes that are not valid in a
# multibyte locale -- TODO confirm). That did not restore the user's actual
# locale and left the session in "C" whenever the hard-coded locale was not
# installed. Matching byte-by-byte with useBytes = TRUE finds the same ASCII
# substrings without touching the global locale at all.
elite.nomeps$party[which(
  elite.nomeps$country == "Spain" &
    elite.nomeps$year %in% c("2008", "2009", "2010", "2011", "2012") &
    grepl("Canaria|Verds", elite.nomeps$party, useBytes = TRUE)
)] <- "Other"
# Switzerland 2011 -- exact-label recodes
# "SP PS" is relabelled "SPS PSS"
elite.nomeps$party[which(
  elite.nomeps$country == "Switzerland" &
    elite.nomeps$year == "2011" &
    elite.nomeps$party == "SP PS"
)] <- "SPS PSS"
# small parties pooled into "Other"
elite.nomeps$party[which(
  elite.nomeps$country == "Switzerland" &
    elite.nomeps$year == "2011" &
    elite.nomeps$party %in% c(
      "EVP PEP", "MCG", "EVP PEV", "Lega", "EDU UDF", "LPS PLS"
    )
)] <- "Other"
# "Pda PST" is relabelled "FDP PRD"
# NOTE(review): every other small party in this section goes to "Other";
# confirm that mapping "Pda PST" to "FDP PRD" is intended.
elite.nomeps$party[which(
  elite.nomeps$country == "Switzerland" &
    elite.nomeps$year == "2011" &
    elite.nomeps$party == "Pda PST"
)] <- "FDP PRD"
# UK 1987-1991 -- Plaid Cymru and the Scottish National Party are pooled
# into "Other"
elite.nomeps$party[which(
  elite.nomeps$country == "United Kingdom" &
    elite.nomeps$year %in% c("1987", "1988", "1989", "1990", "1991") &
    elite.nomeps$party %in% c("Plaid Cymru", "Scottish National Party")
)] <- "Other"
# UK 1992
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "1992" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "1992" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 1993
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "1993" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 1994
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "1994" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 1995
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "1995" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 1996
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "1996" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 1997
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "1997" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 1998
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "1998" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
# UK 1999
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "1999" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
# UK 2000
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2000" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
# UK 2001
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2001" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
# UK 2005
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2005" &
                           elite.nomeps$party == "Scottish National Party")] <-
  "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2005" &
                           elite.nomeps$party == "Conservative Party")] <-
  "Conservative"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2005" &
                           elite.nomeps$party == "Labour Party")] <- "Labour"
# UK 2006
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2006" &
                           elite.nomeps$party == "Scottish National Party")] <-
  "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2006" &
                           elite.nomeps$party == "Conservative Party")] <-
  "Conservative"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2006" &
                           elite.nomeps$party == "Labour Party")] <- "Labour"
# UK 2007
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2007" &
                           elite.nomeps$party == "Scottish National Party")] <-
  "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2007" &
                           elite.nomeps$party == "Conservative Party")] <-
  "Conservative"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2007" &
                           elite.nomeps$party == "Labour Party")] <- "Labour"
# UK 2008
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2008" &
                           elite.nomeps$party == "Scottish National Party")] <-
  "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2008" &
                           elite.nomeps$party == "Conservative Party")] <-
  "Conservative"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2008" &
                           elite.nomeps$party == "Labour Party")] <- "Labour"
# UK 2009
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2009" &
                           elite.nomeps$party == "Scottish National Party")] <-
  "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2009" &
                           elite.nomeps$party == "Conservative Party")] <-
  "Conservative"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2009" &
                           elite.nomeps$party == "Labour Party")] <- "Labour"
# UK 2010
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2010" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2010" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 2011
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2011" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2011" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 2012
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2012" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2012" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 2013
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2013" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2013" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 2014
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2014" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2014" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# UK 2015
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2015" &
                           elite.nomeps$party == "Plaid Cymru")] <- "Other"
elite.nomeps$party[which(elite.nomeps$country == "United Kingdom" &
                           elite.nomeps$year == "2015" &
                           elite.nomeps$party == "Scottish National Party")] <- "Other"
# Uruguay 2006-2014
# Relabel Uruguayan party names in elite.nomeps. 2006-2009 shorten both
# "Partido Independiente" and "EP/FA/NM"; 2010, 2011, 2012 and 2014 only
# shorten "Partido Independiente". There is no rule for 2013.
uy_rules <- list(
  list(years = c("2006", "2007", "2008", "2009"),
       relabel = c("Partido Independiente" = "PI",
                   "EP/FA/NM" = "EP-FA-NM")),
  list(years = c("2010", "2011", "2012", "2014"),
       relabel = c("Partido Independiente" = "PI"))
)
for (uy_rule in uy_rules) {
  for (uy_year in uy_rule$years) {
    for (old_label in names(uy_rule$relabel)) {
      hits <- which(elite.nomeps$country == "Uruguay" &
                      elite.nomeps$year == uy_year &
                      elite.nomeps$party == old_label)
      elite.nomeps$party[hits] <- uy_rule$relabel[[old_label]]
    }
  }
}
# drop the helpers so the workspace only carries the recoded data forward
rm(uy_rules, uy_rule, uy_year, old_label, hits)

# save the cleaned version
# NOTE(review): the same path is written again further down with the weighted
# data, so this copy ends up overwritten -- confirm that is intended.
write.csv(elite.nomeps, "./final/final-elite-weighted-nomeps.csv",
          row.names = FALSE)

### Now we need to do the weighted EMDs
# party weights; the weighting code reads its country, year, party and weight
# columns
p <- read.csv("./partyweights/partyweights.csv", stringsAsFactors = FALSE)

# One list slot per row of df.final, holding the elite respondents for that
# country-year (a zero-row data frame when there are none). seq_along() keeps
# the loop from running at all if df.final happens to have no rows.
elist <- vector("list", nrow(df.final))
for (i in seq_along(elist)) {
  elist[[i]] <- elite.nomeps[which(elite.nomeps$country == df.final$country[i] &
                                     elite.nomeps$year == df.final$year[i]), ]
}
# Give every country-year slice in `elist` a respondent weight column `w` that
# sums to 1 within the slice, and a companion column `w_d`. Which weighting is
# used depends on what the slice contains:
#   * no ideology answers      -> slice is emptied
#   * neither party nor sex    -> equal weights
#   * sex only                 -> men/women rescaled to (1 - ww) and ww
#   * party only               -> each party rescaled to its weight in `p`
#   * both                     -> each sex-by-party cell rescaled to
#                                 party weight * sex share
# `w_d` equals `w` only in the last case and is NA otherwise.
# `ww` is read from df.final$pc_women_elite_real and is used as a proportion
# (presumably the real-world share of women among elites -- confirm).
for (i in seq_along(elist)) {
  ### work on a temporary copy of this country-year's respondents
  tmp <- elist[[i]]

  ### if no ideology answers (looking at you, CCS) then skip it
  if (nrow(tmp[which(!(is.na(tmp$ideology))), ]) == 0) {
    tmp$w <- tmp$w_d <- rep(NA, nrow(tmp))
    elist[[i]] <- tmp[which(!(is.na(tmp$ideology))), ] # i.e., delete it
    rm(tmp)
    next
  }

  ### if no party or sex info, just weight everything evenly
  if (all(is.na(unique(tmp$party))) && all(is.na(unique(tmp$sex)))) {
    tmp <- tmp[!is.na(tmp$ideology_scaled), ]
    tmp$w <- rep(1 / nrow(tmp), nrow(tmp))
    stopifnot(all.equal(sum(tmp$w), 1))
    tmp$w_d <- rep(NA, nrow(tmp))
    elist[[i]] <- tmp
    rm(tmp)
    next
  }

  ### if just no party info, weight on sex only
  if (all(is.na(unique(tmp$party)))) {
    tmp <- tmp[which(!(is.na(tmp$ideology_scaled)) & !(is.na(tmp$sex))), ]
    tmp$w <- rep(1 / nrow(tmp), nrow(tmp))
    ww <- df.final$pc_women_elite_real[i]
    mw <- 1 - ww
    men <- tmp[which(tmp$sex == "Male"), ]
    women <- tmp[which(tmp$sex == "Female"), ]
    # rescale each sex so its weights total that sex's target share
    men$w <- (mw / sum(men$w)) * men$w
    women$w <- (ww / sum(women$w)) * women$w
    tmp <- rbind(women, men)
    stopifnot(all.equal(sum(tmp$w), 1))
    tmp$w_d <- rep(NA, nrow(tmp))
    elist[[i]] <- tmp
    rm(ww, mw, men, women, tmp)
    next
  }

  ### if no sex info, weight on party only
  if (all(is.na(unique(tmp$sex)))) {
    tmp <- tmp[which(!(is.na(tmp$ideology_scaled)) & !(is.na(tmp$party))), ]
    tmp$w <- rep(1 / nrow(tmp), nrow(tmp))
    ps <- sort(unique(tmp$party))
    for (j in seq_along(ps)) {
      pw <- p$weight[which(p$country == df.final$country[i] &
                             p$year == df.final$year[i] &
                             p$party == ps[j])]
      # a party without a row in the weights file cannot be rescaled; stop
      # here rather than on the assignment below
      stopifnot(length(pw) > 0, !(is.na(pw)))
      in_party <- which(tmp$party == ps[j])
      # rescale this party's respondents so their weights total pw
      tmp$w[in_party] <- (pw / sum(tmp$w[in_party])) * tmp$w[in_party]
      rm(in_party)
    }
    stopifnot(all.equal(sum(tmp$w), 1))
    tmp$w_d <- rep(NA, nrow(tmp))
    elist[[i]] <- tmp
    rm(pw, ps, tmp)
    next
  }

  ### if both, weight on both
  tmp <- tmp[which(!(is.na(tmp$ideology_scaled)) &
                     !(is.na(tmp$sex)) & !(is.na(tmp$party))), ]
  tmp$w <- rep(1 / nrow(tmp), nrow(tmp))

  # get the gender weights
  ww <- df.final$pc_women_elite_real[i]
  mw <- 1 - ww
  # break it into sex-by-party cells, then weight and recombine; split()
  # also returns the empty cells, which pass through as zero-row frames
  obslist <- split(tmp, f = list(tmp$sex, tmp$party))
  for (j in seq_along(obslist)) {
    pw <- p$weight[which(p$country == df.final$country[i] &
                           p$year == df.final$year[i] &
                           p$party == unique(obslist[[j]]$party))]
    # a non-empty cell needs a party weight to be rescaled
    stopifnot(nrow(obslist[[j]]) == 0 || length(pw) > 0)
    sw <- ifelse(unique(obslist[[j]]$sex) == "Male", mw, ww)
    W <- pw * sw
    obslist[[j]]$w <- (W / sum(obslist[[j]]$w)) * obslist[[j]]$w
    rm(pw, sw, W) # so no accidental duplicates
  }
  tmp <- do.call(rbind, obslist)
  stopifnot(abs(1 - sum(tmp$w)) < .1)
  # Sometimes a party has respondents of only one sex, in which case the total
  # falls slightly short of 1. There is not much we can do about that, but the
  # errors are small. Reweight in such cases: the shortfall is spread evenly
  # over the women, then everything is renormalised to sum to 1.
  if (!isTRUE(all.equal(sum(tmp$w), 1))) {
    shortfall <- 1 - sum(tmp$w)
    is_female <- which(tmp$sex == "Female")
    tmp$w[is_female] <- tmp$w[is_female] + shortfall / length(is_female)
    tmp$w <- tmp$w * (1 / sum(tmp$w))
    # the women's total share must still be close to the target
    stopifnot(abs(ww - sum(tmp$w[which(tmp$sex == "Female")])) < .05)
    rm(shortfall, is_female)
  }
  stopifnot(all.equal(sum(tmp$w), 1))
  tmp$w_d <- tmp$w
  elist[[i]] <- tmp
  rm(mw, ww, tmp, obslist)
}
# Stack the per-country-year slices back into a single data frame, then drop
# the slice list and the party weights, which are no longer needed.
elite.weighted <- do.call(rbind, elist)
rm(list = c("elist", "p"))

# save the weighted version
write.csv(x = elite.weighted,
          file = "./final/final-elite-weighted-nomeps.csv",
          row.names = FALSE)

### run the loop on weighted data
# Add the result columns for the weighted analysis to df.final, each starting
# out as all-NA. Every measure exists twice: "<stem>_weighted" and
# "<stem>_weighted_d".
aff_tiers <- c("lessaffluent", "midloaffluent", "midaffluent",
               "midhiaffluent", "moreaffluent")
col_stems <- c("mscale", "escale", "esurvey", "emd_all",
               paste0("emd_", aff_tiers),
               "nobs_elite", "emean", "evar",
               # each tier split by information level: hiinfo, then loinfo
               paste0("emd_", rep(aff_tiers, each = 2),
                      c("_hiinfo", "_loinfo")))
new_cols <- c(paste0(col_stems, "_weighted"),
              paste0(col_stems, "_weighted_d"))
# Columns are created in reverse list order (the last name is appended
# first); later code may rely on the resulting column order.
for (col_name in rev(new_cols)) {
  df.final[[col_name]] <- rep(NA, nrow(df.final))
}
rm(aff_tiers, col_stems, new_cols, col_name)
# set up temp storage: one slot per row of df.final, allocated up front so
# the lists are not regrown on every iteration
mlist <- vector("list", nrow(df.final))
elist <- vector("list", nrow(df.final))
# break out the data -- takes a minute
pb <- txtProgressBar(min = 1, max = nrow(df.final), style = 3)
for (i in seq_len(nrow(df.final))) {
  # mass and weighted-elite respondents for this country-year
  mlist[[i]] <- mass[which(mass$country == df.final$country[i] &
                             mass$year == df.final$year[i]), ]
  elist[[i]] <- elite.weighted[which(elite.weighted$country ==
                                       df.final$country[i] &
                                       elite.weighted$year == df.final$year[i]), ]
  # record which elite survey(s) the respondents came from
  if (nrow(elist[[i]]) > 0) {
    df.final$esurvey_weighted[i] <-
      paste0(unique(elist[[i]]$survey), collapse = ", ")
  }
  setTxtProgressBar(pb, i)
}
# finish the progress line before anything else is printed
close(pb)
# run it
pb <- txtProgressBar(min = 1, max = nrow(df.final), style = 3)
for(i in 1:nrow(df.final)){
  mdata <- mlist[[i]]
  edata <- elist[[i]]
  
  if(nrow(edata[which(!is.na(edata$ideology)),]) == 0 |
     nrow(mdata[which(!is.na(mdata$ideology)),]) == 0){
    next
  } else {
    # create a temporary data frame of class measures, ranked by our preference
    class <- data.frame(cbind(mdata$country, mdata$wealth, mdata$income,
                              mdata$occupation, mdata$education))
    colnames(class) <- c("country","wealth","income","occupation","education")
    # delete all NAs
    for(j in ncol(class):2){
      if(all(is.na(unique(class[,j]))) == T){
        class <- class[,-j]
      }
    }
    # make sure the loop uses our best class measure
    whichclass <- colnames(class)[2] # since 1 is country
    colnames(mdata)[grep(whichclass, names(mdata))] <- "class"
    
    # create temporary data for knowledge
    info <- data.frame(cbind(mdata$country, mdata$knowledge, mdata$education))
    colnames(info) <- c("country","knowledge","education")
    for(j in ncol(info):2){
      if(all(is.na(unique(info[,j]))) == T){
        info <- info[,-j]
      }
    }
    # make sure the loop uses our best knowledge measure
    whichinfo <- colnames(info)[2]
    colnames(mdata)[grep(whichinfo, names(mdata))] <- "know"
    
    ### Elite
    y <- as.matrix(na.omit(edata$ideology_scaled))
    w.y <- as.matrix(na.omit(edata$w))
    stopifnot(all.equal(sum(w.y),1))
    
    ### All mass
    x <- as.matrix(na.omit(mdata$ideology_scaled))
    w.x <- as.matrix(rep(1/nrow(data.frame(na.omit(mdata$ideology_scaled))),
                         nrow(data.frame(na.omit(mdata$ideology_scaled)))))
    
    ### Less affluent mass
    x.lessaffluent <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0")]))
    w.x.lessaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "0")])))))
    
    ### Midlo affluent mass
    x.midloaffluent <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1")]))
    w.x.midloaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "1")])))))
    
    ### Mid affluent mass
    x.midaffluent <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2")]))
    w.x.midaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "2")])))))
    
    ### Midhi affluent mass
    x.midhiaffluent <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3")]))
    w.x.midhiaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "3")])))))
    
    ### More affluent mass
    x.moreaffluent <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4")]))
    w.x.moreaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "4")])))))
    
    ### Less affluent mass, high info
    x.lessaffluent.hi <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0" & mdata$know == "4")]))
    w.x.lessaffluent.hi <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0" & mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "0" &
                                      mdata$know == "4")])))))
    
    ### Midlo affluent mass, high info
    x.midloaffluent.hi <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1" & mdata$know == "4")]))
    w.x.midloaffluent.hi <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1" & mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "1" &
                                      mdata$know == "4")])))))
    
    ### Mid affluent mass, high info
    x.midaffluent.hi <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2" & mdata$know == "4")]))
    w.x.midaffluent.hi <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2" & mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "2" &
                                      mdata$know == "4")])))))
    
    ### Midhi affluent mass, high info
    x.midhiaffluent.hi <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3" & mdata$know == "4")]))
    w.x.midhiaffluent.hi <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3" & mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "3" &
                                      mdata$know == "4")])))))
    
    ### More affluent mass, high info
    x.moreaffluent.hi <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4" & mdata$know == "4")]))
    w.x.moreaffluent.hi <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4" & mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "4" &
                                      mdata$know == "4")])))))
    
    ### Less affluent mass, low info
    x.lessaffluent.lo <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0" & mdata$know == "0")]))
    w.x.lessaffluent.lo <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "0" & mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "0" &
                                      mdata$know == "0")])))))
    
    ### Midlo affluent mass, low info
    x.midloaffluent.lo <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1" & mdata$know == "0")]))
    w.x.midloaffluent.lo <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "1" & mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "1" &
                                      mdata$know == "0")])))))
    
    ### Mid affluent mass, low info
    x.midaffluent.lo <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2" & mdata$know == "0")]))
    w.x.midaffluent.lo <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "2" & mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "2" &
                                      mdata$know == "0")])))))
    
    ### Midhi affluent mass, low info
    x.midhiaffluent.lo <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3" & mdata$know == "0")]))
    w.x.midhiaffluent.lo <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "3" & mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "3" &
                                      mdata$know == "0")])))))
    
    ### More affluent mass, low info
    x.moreaffluent.lo <- as.matrix(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4" & mdata$know == "0")]))
    w.x.moreaffluent.lo <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$ideology_scaled[which(mdata$class == "4" & mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$ideology_scaled[which(mdata$class == "4" &
                                      mdata$know == "0")])))))
    
    # ensure everything is going correctly
    if(length(x.lessaffluent) == 0 | length(x.moreaffluent) == 0 |
       length(y) == 0){
      warning("check the if-length condition!")
      next
    } else {
      # some defensive programming checks
      stopifnot(all.equal(sum(w.x), 1))
      stopifnot(all.equal(sum(w.y), 1))
      if(length(x.lessaffluent) > 0){
        stopifnot(all.equal(sum(w.x.lessaffluent), 1))
      }
      if(length(x.midloaffluent) > 0){
        stopifnot(all.equal(sum(w.x.midloaffluent), 1))
      }
      if(length(x.midaffluent) > 0){
        stopifnot(all.equal(sum(w.x.midaffluent), 1))
      }
      if(length(x.midhiaffluent) > 0){
        stopifnot(all.equal(sum(w.x.midhiaffluent), 1))
      }
      if(length(x.moreaffluent) > 0){
        stopifnot(all.equal(sum(w.x.moreaffluent), 1))
      }
      if(length(x.lessaffluent.lo) > 0){
        stopifnot(all.equal(sum(w.x.lessaffluent.lo), 1))
      }
      if(length(x.lessaffluent.hi) > 0){
        stopifnot(all.equal(sum(w.x.lessaffluent.hi), 1))
      }
      if(length(x.midloaffluent.lo) > 0){
        stopifnot(all.equal(sum(w.x.midloaffluent.lo), 1))
      }
      if(length(x.midloaffluent.hi) > 0){
        stopifnot(all.equal(sum(w.x.midloaffluent.hi), 1))
      }
      if(length(x.midaffluent.lo) > 0){
        stopifnot(all.equal(sum(w.x.midaffluent.lo), 1))
      }
      if(length(x.midaffluent.hi) > 0){
        stopifnot(all.equal(sum(w.x.midaffluent.hi), 1))
      }
      if(length(x.midhiaffluent.lo) > 0){
        stopifnot(all.equal(sum(w.x.midhiaffluent.lo), 1))
      }
      if(length(x.midhiaffluent.hi) > 0){
        stopifnot(all.equal(sum(w.x.midhiaffluent.hi), 1))
      }
      if(length(x.moreaffluent.lo) > 0){
        stopifnot(all.equal(sum(w.x.moreaffluent.lo), 1))
      }
      if(length(x.moreaffluent.hi) > 0){
        stopifnot(all.equal(sum(w.x.moreaffluent.hi), 1))
      }
      
      # EMD for all
      df.final$emd_all_weighted[which(df.final$country ==
                                        unique(mdata$country) &
                                        df.final$year == unique(mdata$year))] <-
        emdw(x,w.x,y,w.y,max.iter = 100000)
      
      # EMD by affluence
      df.final$emd_lessaffluent_weighted[which(df.final$country ==
                                                 unique(mdata$country) &
                                                 df.final$year ==
                                                 unique(mdata$year))] <-
        emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
      
      df.final$emd_midloaffluent_weighted[which(df.final$country ==
                                                  unique(mdata$country) &
                                                  df.final$year ==
                                                  unique(mdata$year))] <-
        emdw(x.midloaffluent,w.x.midloaffluent,y,w.y,max.iter = 100000)
      
      df.final$emd_midaffluent_weighted[which(df.final$country ==
                                                unique(mdata$country) &
                                                df.final$year ==
                                                unique(mdata$year))] <-
        emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
      
      df.final$emd_midhiaffluent_weighted[which(df.final$country ==
                                                  unique(mdata$country) &
                                                  df.final$year ==
                                                  unique(mdata$year))] <-
        emdw(x.midhiaffluent,w.x.midhiaffluent,y,w.y,max.iter = 100000)
      
      df.final$emd_moreaffluent_weighted[which(df.final$country ==
                                                 unique(mdata$country) &
                                                 df.final$year ==
                                                 unique(mdata$year))] <-
        emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
      
      # EMD, less affluent, by political knowledge
      df.final$emd_lessaffluent_hiinfo_weighted[which(df.final$country ==
                                                        unique(mdata$country) &
                                                        df.final$year ==
                                                        unique(mdata$year))] <-
        emdw(x.lessaffluent.hi,w.x.lessaffluent.hi,y,w.y,max.iter = 100000)
      
      df.final$emd_lessaffluent_loinfo_weighted[which(df.final$country ==
                                                        unique(mdata$country) &
                                                        df.final$year ==
                                                        unique(mdata$year))] <-
        emdw(x.lessaffluent.lo,w.x.lessaffluent.lo,y,w.y,max.iter = 100000)
      
      # EMD, midlo affluent, by political knowledge
      df.final$emd_midloaffluent_hiinfo_weighted[which(df.final$country ==
                                                         unique(mdata$country) &
                                                         df.final$year ==
                                                         unique(mdata$year))] <-
        emdw(x.midloaffluent.hi,w.x.midloaffluent.hi,y,w.y,max.iter = 100000)
      
      df.final$emd_midloaffluent_loinfo_weighted[which(df.final$country ==
                                                         unique(mdata$country) &
                                                         df.final$year ==
                                                         unique(mdata$year))] <-
        emdw(x.midloaffluent.lo,w.x.midloaffluent.lo,y,w.y,max.iter = 100000)
      
      # EMD, mid affluent, by political knowledge
      df.final$emd_midaffluent_hiinfo_weighted[which(df.final$country ==
                                                       unique(mdata$country) &
                                                       df.final$year ==
                                                       unique(mdata$year))] <-
        emdw(x.midaffluent.hi,w.x.midaffluent.hi,y,w.y,max.iter = 100000)
      
      df.final$emd_midaffluent_loinfo_weighted[which(df.final$country ==
                                                       unique(mdata$country) &
                                                       df.final$year ==
                                                       unique(mdata$year))] <-
        emdw(x.midaffluent.lo,w.x.midaffluent.lo,y,w.y,max.iter = 100000)
      
      # EMD, midhi affluent, by political knowledge
      df.final$emd_midhiaffluent_hiinfo_weighted[which(df.final$country ==
                                                         unique(mdata$country) &
                                                         df.final$year ==
                                                         unique(mdata$year))] <-
        emdw(x.midhiaffluent.hi,w.x.midhiaffluent.hi,y,w.y,max.iter = 100000)
      
      df.final$emd_midhiaffluent_loinfo_weighted[which(df.final$country ==
                                                         unique(mdata$country) &
                                                         df.final$year ==
                                                         unique(mdata$year))] <-
        emdw(x.midhiaffluent.lo,w.x.midhiaffluent.lo,y,w.y,max.iter = 100000)
      
      # EMD, more affluent, by political knowledge
      df.final$emd_moreaffluent_hiinfo_weighted[which(df.final$country ==
                                                        unique(mdata$country) &
                                                        df.final$year ==
                                                        unique(mdata$year))] <-
        emdw(x.moreaffluent.hi,w.x.moreaffluent.hi,y,w.y,max.iter = 100000)
      
      df.final$emd_moreaffluent_loinfo_weighted[which(df.final$country ==
                                                        unique(mdata$country) &
                                                        df.final$year ==
                                                        unique(mdata$year))] <-
        emdw(x.moreaffluent.lo,w.x.moreaffluent.lo,y,w.y,max.iter = 100000)
      
      ### store mean and variance
      df.final$evar_weighted[which(df.final$country == unique(mdata$country) &
                                     df.final$year == unique(mdata$year))] <-
        sd(y)^2
      df.final$emean_weighted[which(df.final$country == unique(mdata$country) &
                                      df.final$year == unique(mdata$year))] <-
        sum((y*w.y)) # carry weights through
      
      ### Store meta-data
      df.final$nobs_elite_weighted[which(df.final$country ==
                                           unique(mdata$country) &
                                           df.final$year == unique(mdata$year))] <-
        length(y)
      df.final$mscale_weighted[which(df.final$country ==
                                       unique(mdata$country) &
                                       df.final$year == unique(mdata$year))] <-
        paste(unique(mdata$ideology_scale_orig),collapse=", ")
      df.final$escale_weighted[which(df.final$country ==
                                       unique(mdata$country) &
                                       df.final$year == unique(mdata$year))] <-
        paste(unique(edata$ideology_scale_orig),collapse=", ")
      
      
      if(all(is.na(unique(edata$w_d)))){
        next
      } else {
        
        # get the other weights
        y <- as.matrix(na.omit(edata$ideology_scaled))
        w.y <- as.matrix(na.omit(edata$w_d))
        stopifnot(all.equal(sum(w.y),1))
        
        # EMD for all
        df.final$emd_all_weighted_d[which(df.final$country ==
                                            unique(mdata$country) &
                                            df.final$year == unique(mdata$year))] <-
          emdw(x,w.x,y,w.y,max.iter = 100000)
        
        # EMD by affluence
        df.final$emd_lessaffluent_weighted_d[which(df.final$country ==
                                                     unique(mdata$country) &
                                                     df.final$year ==
                                                     unique(mdata$year))] <-
          emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
        
        df.final$emd_midloaffluent_weighted_d[which(df.final$country ==
                                                      unique(mdata$country) &
                                                      df.final$year ==
                                                      unique(mdata$year))] <-
          emdw(x.midloaffluent,w.x.midloaffluent,y,w.y,max.iter = 100000)
        
        df.final$emd_midaffluent_weighted_d[which(df.final$country ==
                                                    unique(mdata$country) &
                                                    df.final$year ==
                                                    unique(mdata$year))] <-
          emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
        
        df.final$emd_midhiaffluent_weighted_d[which(df.final$country ==
                                                      unique(mdata$country) &
                                                      df.final$year ==
                                                      unique(mdata$year))] <-
          emdw(x.midhiaffluent,w.x.midhiaffluent,y,w.y,max.iter = 100000)
        
        df.final$emd_moreaffluent_weighted_d[which(df.final$country ==
                                                     unique(mdata$country) &
                                                     df.final$year ==
                                                     unique(mdata$year))] <-
          emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
        
        # EMD, less affluent, by political knowledge
        df.final$emd_lessaffluent_hiinfo_weighted_d[which(df.final$country ==
                                                            unique(mdata$country) &
                                                            df.final$year ==
                                                            unique(mdata$year))] <-
          emdw(x.lessaffluent.hi,w.x.lessaffluent.hi,y,w.y,max.iter = 100000)
        
        df.final$emd_lessaffluent_loinfo_weighted_d[which(df.final$country ==
                                                            unique(mdata$country) &
                                                            df.final$year ==
                                                            unique(mdata$year))] <-
          emdw(x.lessaffluent.lo,w.x.lessaffluent.lo,y,w.y,max.iter = 100000)
        
        # EMD, midlo affluent, by political knowledge
        df.final$emd_midloaffluent_hiinfo_weighted_d[which(df.final$country ==
                                                             unique(mdata$country) &
                                                             df.final$year ==
                                                             unique(mdata$year))] <-
          emdw(x.midloaffluent.hi,w.x.midloaffluent.hi,y,w.y,max.iter = 100000)
        
        df.final$emd_midloaffluent_loinfo_weighted_d[which(df.final$country ==
                                                             unique(mdata$country) &
                                                             df.final$year ==
                                                             unique(mdata$year))] <-
          emdw(x.midloaffluent.lo,w.x.midloaffluent.lo,y,w.y,max.iter = 100000)
        
        # EMD, mid affluent, by political knowledge
        df.final$emd_midaffluent_hiinfo_weighted_d[which(df.final$country ==
                                                           unique(mdata$country) &
                                                           df.final$year ==
                                                           unique(mdata$year))] <-
          emdw(x.midaffluent.hi,w.x.midaffluent.hi,y,w.y,max.iter = 100000)
        
        df.final$emd_midaffluent_loinfo_weighted_d[which(df.final$country ==
                                                           unique(mdata$country) &
                                                           df.final$year ==
                                                           unique(mdata$year))] <-
          emdw(x.midaffluent.lo,w.x.midaffluent.lo,y,w.y,max.iter = 100000)
        
        # EMD, midhi affluent, by political knowledge
        df.final$emd_midhiaffluent_hiinfo_weighted_d[which(df.final$country ==
                                                             unique(mdata$country) &
                                                             df.final$year ==
                                                             unique(mdata$year))] <-
          emdw(x.midhiaffluent.hi,w.x.midhiaffluent.hi,y,w.y,max.iter = 100000)
        
        df.final$emd_midhiaffluent_loinfo_weighted_d[which(df.final$country ==
                                                             unique(mdata$country) &
                                                             df.final$year ==
                                                             unique(mdata$year))] <-
          emdw(x.midhiaffluent.lo,w.x.midhiaffluent.lo,y,w.y,max.iter = 100000)
        
        # EMD, more affluent, by political knowledge
        df.final$emd_moreaffluent_hiinfo_weighted_d[which(df.final$country ==
                                                            unique(mdata$country) &
                                                            df.final$year ==
                                                            unique(mdata$year))] <-
          emdw(x.moreaffluent.hi,w.x.moreaffluent.hi,y,w.y,max.iter = 100000)
        
        df.final$emd_moreaffluent_loinfo_weighted_d[which(df.final$country ==
                                                            unique(mdata$country) &
                                                            df.final$year ==
                                                            unique(mdata$year))] <-
          emdw(x.moreaffluent.lo,w.x.moreaffluent.lo,y,w.y,max.iter = 100000)
        
        ### meta-info
        # store mean and variance
        df.final$evar_weighted_d[which(df.final$country == unique(mdata$country) &
                                         df.final$year == unique(mdata$year))] <-
          sd(y)^2
        df.final$emean_weighted_d[which(df.final$country == unique(mdata$country) &
                                          df.final$year == unique(mdata$year))] <-
          mean(y)
        df.final$nobs_elite_weighted_d[which(df.final$country ==
                                               unique(mdata$country) &
                                               df.final$year == unique(mdata$year))] <-
          length(y)
        df.final$mscale_weighted_d[which(df.final$country ==
                                           unique(mdata$country) &
                                           df.final$year == unique(mdata$year))] <-
          paste(unique(mdata$ideology_scale_orig),collapse=", ")
        df.final$escale_weighted_d[which(df.final$country ==
                                           unique(mdata$country) &
                                           df.final$year == unique(mdata$year))] <-
          paste(unique(edata$ideology_scale_orig),collapse=", ")
      }
    }
  }
  setTxtProgressBar(pb, i)
}
# easier than looping: the design-weighted variant reuses the survey label
# already stored for the weighted variant
df.final$esurvey_weighted_d <- df.final$esurvey_weighted

### add in response rates
# read the response-rate file, keep only the merge keys plus the four rr_*
# columns, and left-join them onto the country-year panel
rr_cols <- c("country", "year", "rr_50", "rr_60", "rr_70", "rr_80")
rr <- read.csv("./final/final-rrs.csv", stringsAsFactors = FALSE)
rr <- rr[, rr_cols]
df.final <- merge(df.final, rr, by = c("country", "year"),
                  all.x = TRUE, all.y = FALSE)

##### CHANGE SOME NAMES (LEGACY) -- labeling unweighted and relabeling weighted #####
# drop the mscale columns of the _full, _nomep, _weighted and _weighted_d
# variants (assigning NULL to an absent column is a no-op)
for (dead_col in c("mscale_full", "mscale_nomep", "mscale_weighted",
                   "mscale_weighted_d")) {
  df.final[[dead_col]] <- NULL
}

# step 1: the columns that currently carry no suffix get "_unweighted"
legacy_tiers <- paste0("emd_", c("less", "midlo", "mid", "midhi", "more"),
                       "affluent")
legacy_cols <- c("escale", "evar", "emean", "esurvey", "emd_all", "nobs_elite",
                 paste0(rep(legacy_tiers, each = 2), c("_hiinfo", "_loinfo")),
                 legacy_tiers)
is_legacy <- names(df.final) %in% legacy_cols
names(df.final)[is_legacy] <- paste(names(df.final)[is_legacy], "unweighted",
                                    sep = "_")

# step 2: "_weighted" columns lose their suffix and become the default names;
# anything containing "_d" (the "_weighted_d" variant) keeps its name
col_names <- names(df.final)
strip_suffix <- grepl("_weighted", col_names) &
  !grepl("_d", col_names, fixed = TRUE)
col_names[strip_suffix] <- gsub("_weighted", "", col_names[strip_suffix])
names(df.final) <- col_names

### finally, reorder columns
# country-year identifiers and covariates, in output order
covariate_cols <- c(
  "country", "year", "iso2c", "iso3c", "region", "gdp", "gdp_log", "gini",
  "trade_pc_gdp", "fdi_net", "fdi_net_gdp", "fdi_net_gdp_log", "oda_pc_gdp",
  "ext_debt_interest", "remit_pc_gdp", "pov_pc_pop", "pov_rate",
  "vdem_donate", "vdem_pubfin", "vdem_polyarchy", "vdem_libdem",
  "vdem_delibdem", "vdem_egaldem", "vdem_partipdem", "vdem_compulsory",
  "vdem_turnout_vap", "vdem_turnout_vap_leg", "vdem_turnout_reg",
  "vdem_turnout_reg_leg", "vdem_left_gov", "vdem_party_inst",
  "vdem_cs_partic", "vdem_ccso_index", "vdem_corr_index", "vdem_corr_leg",
  "vdem_corr_ti_cpi", "vdem_clientelism", "vdem_gdp_maddison",
  "vdem_democracy_factored",
  "trade_globalization", "financial_globalization",
  "personal_globalization", "information_globalization",
  "cultural_globalization", "political_globalization",
  "xcut_race_religion", "xcut_race_geog", "xcut_race_income",
  "xcut_ethnicity_religion", "xcut_ethnicity_geog", "xcut_ethnicity_income",
  "xcut_lang_religion", "xcut_lang_geog", "xcut_lang_income",
  "xcut_religion_geog", "xcut_religion_income", "xcut_income_geog",
  "coll_bargaining_pc", "trade_union_density", "dreher_corruption",
  "democracy_duration", "dalp_clientelism", "dalp_clientelism_econ",
  "gini_disp", "undp_hdi", "vi_wcoord", "ictwss_cent", "ictwss_conc",
  "ban_corp_don_parties", "contrib_limit_parties", "limit_party_spending",
  "camp_fin", "camp_fin_add", "ts_ban_corp_don_parties",
  "ts_contrib_limit_parties", "ts_limit_party_spending", "compulsory",
  "disproportionality", "pr", "presidential", "fptp", "dist_mag",
  "mean_party_age", "pc_women_elite", "pc_women_elite_real", "clea_turnout",
  "left_gov", "rr_50", "rr_60", "rr_70", "rr_80"
)

# mass-side summaries
mass_cols <- c(
  "mmean", "mmean_poor", "mmean_midlo", "mmean_mid", "mmean_midhi",
  "mmean_rich", "mvar", "mvar_poor", "mvar_midlo", "mvar_mid", "mvar_midhi",
  "mvar_rich", "nobs_mass_all", "nobs_mass_poor", "nobs_mass_midlo",
  "nobs_mass_mid", "nobs_mass_midhi", "nobs_mass_rich", "mscale", "msurvey",
  "mclass_var", "mknow_var"
)

# elite-side columns: the same 21 stems repeat once per variant, first with
# no suffix and then with each of the four suffixes below
emd_tiers <- paste0("emd_", c("less", "midlo", "mid", "midhi", "more"),
                    "affluent")
elite_stems <- c("emd_all", emd_tiers,
                 paste0(rep(emd_tiers, each = 2), c("_loinfo", "_hiinfo")),
                 "nobs_elite", "escale", "esurvey", "emean", "evar")
elite_variants <- c("", "_unweighted", "_full", "_nomep", "_weighted_d")
elite_cols <- unlist(lapply(elite_variants,
                            function(suffix) paste0(elite_stems, suffix)))

# sort rows by country-year and put the columns in the order built above
df.final <- df.final[order(df.final$country, df.final$year),
                     c(covariate_cols, mass_cols, elite_cols)]

#clean up
# write the country-year file, then clear the workspace of everything except
# the utility functions and the data objects named in keep_objs
write.csv(df.final, "./final/final-many-to-many.csv", row.names = FALSE)
keep_objs <- c("completeFun", "expand.grid.df", "qcut", "rescalr",
               "simpleCap", "df.final", "mass", "elite", "elite.full",
               "elite.nomeps", "elite.weighted")
# keep_objs is not in its own list, so it is removed along with the rest
rm(list = setdiff(ls(), keep_objs))

##### BUILD DYADIC DATA #####
# create mass weights: every respondent gets weight 1/n, where n is the number
# of rows in that respondent's country-year; rows come back grouped by sorted
# country-year key
cy_key <- paste(mass$country, mass$year, sep = " ")
cy <- sort(unique(cy_key))
pb <- txtProgressBar(min = 1, max = length(cy), style = 3)
mlist <- lapply(seq_along(cy), function(k) {
  cy_rows <- mass[cy_key == cy[k], ]
  cy_rows$w <- rep(1 / nrow(cy_rows), nrow(cy_rows))
  setTxtProgressBar(pb, k)
  cy_rows
})
mass <- do.call(rbind, mlist)

# create ids (running row numbers)
mass$mass_id <- seq(from = 1, to = nrow(mass), by = 1)
elite.weighted$elite_id <- seq(from = 1, to = nrow(elite.weighted), by = 1)

# change names: prefix every mass column with "m_" and every elite column
# with "e_" so the two sides stay distinguishable once they are crossed
names(mass) <- paste("m", names(mass), sep = "_")
names(elite.weighted) <- paste("e", names(elite.weighted), sep = "_")

# set up temp storage: one mass subset and one elite subset per row of
# df.final (a subset has zero rows when nothing matches that country-year)
n_panel <- nrow(df.final)
mlist <- vector("list", n_panel)
elist <- vector("list", n_panel)
pb <- txtProgressBar(min = 1, max = n_panel, style = 3)
for (i in seq_len(n_panel)) {
  mass_hit <- mass$m_country == df.final$country[i] &
    mass$m_year == df.final$year[i]
  elite_hit <- elite.weighted$e_country == df.final$country[i] &
    elite.weighted$e_year == df.final$year[i]
  mlist[[i]] <- mass[which(mass_hit), ]
  elist[[i]] <- elite.weighted[which(elite_hit), ]
  setTxtProgressBar(pb, i)
}

### create all dyads within matching country-years. this continually appends the
### file but is less memory-intensive
dyad_file <- "./final/final-dyads.csv"
# start from a clean slate so a re-run can never append to stale output
if (file.exists(dyad_file)) {
  file.remove(dyad_file)
}
# the header goes out with the first chunk actually written, which is not
# necessarily i == 1 because country-years without data are skipped
header_written <- FALSE
pb <- txtProgressBar(min = 1, max = nrow(df.final), style = 3)
for(i in seq_len(nrow(df.final))){
  # advance the bar up front so skipped country-years still register
  setTxtProgressBar(pb, i)
  mdata <- mlist[[i]]
  edata <- elist[[i]]
  if(nrow(mdata) == 0 || nrow(edata) == 0){
    next
  }
  
  ### drop the obs without class or ideology, since we can't use them
  mdata <- mdata[!(is.na(mdata$m_ideology_scaled)),]
  mdata <- mdata[!(is.na(mdata$m_wealth) & is.na(mdata$m_income) &
                     is.na(mdata$m_occupation)),]
  edata <- edata[!(is.na(edata$e_ideology_scaled)),]
  # nothing left to pair up after filtering
  if(nrow(mdata) == 0 || nrow(edata) == 0){
    next
  }
  
  ### make the data: every mass respondent crossed with every elite
  dd <- expand.grid.df(mdata,edata)
  # make the EMD (absolute ideological distance within the dyad)
  dd$emd <- abs(dd$m_ideology_scaled - dd$e_ideology_scaled)
  # make a directional statement (signed distance, mass minus elite)
  dd$dist <- dd$m_ideology_scaled - dd$e_ideology_scaled
  # make a class variable: wealth, else income, else occupation
  dd$class <- dd$m_wealth
  dd$class <- ifelse(is.na(dd$class), dd$m_income, dd$class)
  dd$class <- ifelse(is.na(dd$class), dd$m_occupation, dd$class)
  # note scale differences (1 when mass and elite used different scales)
  dd$scalediff <- ifelse(dd$m_ideology_scale_orig == dd$e_ideology_scale_orig,
                         0, 1)
  # store it: same CSV dialect (qmethod = "double", as write.csv uses) for
  # every chunk; the first chunk creates the file and carries the header
  write.table(dd, dyad_file, row.names = FALSE, sep = ",",
              qmethod = "double", col.names = !header_written,
              append = header_written)
  header_written <- TRUE
}
# clean up
rm(list=ls()[!(ls() %in% c("completeFun","expand.grid.df",
                           "qcut","rescalr","simpleCap","df.final"))])

##### CREATE LATIN AMERICA ISSUES DATA #####
files <- list.files(path = "./PELA", pattern = "\\.sav$|\\.SAV$")
files <- paste("./PELA/",files,sep="")
# only from study 73, Argentina, have they been harmonized
files <- files[67:87]
# haven 1.1.1 doesn't allocate enough memory to lapply these files
data.list <- vector("list", length(files))
for(i in seq_along(data.list)){
  tryCatch({
    # read, label and convert in a single expression: the slot is filled only
    # if every step succeeds, so a warning or error part-way through leaves it
    # NULL and the file is picked up by the read.spss() fallback below
    data.list[[i]] <- as.data.frame(as_factor(read_sav(files[i])))
  },
  error=function(cond){
    message(paste("ERROR IN DATASET ",i,sep=""))
    message(paste(cond,"\n",sep=""))
  },
  warning=function(cond){
    message(paste("WARNING IN DATASET ",i,sep=""))
    message(paste(cond,"\n",sep=""))
  })
}
# fallback: re-read every file that haven could not load cleanly with foreign
probs <- which(unlist(lapply(data.list, is.null)))
for(i in probs){
  data.list[[i]] <- read.spss(files[i], use.value.labels = TRUE,
                              use.missings = TRUE, to.data.frame = TRUE)
}
# empty frame that fixes the harmonized column names
pela_names <- c("country","leg.session","ideology","samesex",
                "privatization","wellbeing","jobs","inequality","health",
                "party","sex","survey")
pela <- data.frame(matrix(nrow=0,ncol=12))
colnames(pela) <- pela_names
# select just the columns we want; the studies differ only in how the country,
# session and party columns are spelled (and, for one study, the ROES items)
pela_pick <- function(country, session, party,
                      items = c("ROES101","ROES102","ROES103","ROES104",
                                "ROES106")) {
  c(country, session, "ID1", "VAL1", items, party, "SOCD4")
}
col_spec <- rep(list(pela_pick("pais","legis","partido")), 21)
col_spec[c(1, 2, 6)] <- list(pela_pick("Pais","legis","partido"))
col_spec[[3]] <- pela_pick("País","Per.leg","Partido",
                           items = c("ROES1_a","ROES1_b","ROES1_c","ROES1_d",
                                     "ROES1_f"))
col_spec[[13]] <- pela_pick("pais","legis","PP")
col_spec[[16]] <- pela_pick("Pais","Legis","Partido")
for(i in seq_along(col_spec)){
  data.list[[i]] <- data.list[[i]][,col_spec[[i]]]
}
# the first study gets a constant session code (mapped to a year range later)
data.list[[1]]$legis <- rep("913",dim(data.list[[1]])[1])
# standardize colnames and merge
for(i in seq_along(data.list)){
  study <- data.list[[i]]
  # everything to character so differently-coded studies can be stacked
  study[] <- lapply(study, as.character)
  study$survey <- rep(files[[i]],nrow(study))
  colnames(study) <- pela_names
  pela <- rbind(pela,study)
}
# clear up memory space
rm(data.list,files,i,study,col_spec,pela_pick,pela_names)
# exact-match recoder: entries of x equal to from[k] become to[k]; everything
# else (including NA) is left untouched
swap_exact <- function(x, from, to) {
  hit <- match(x, from)
  found <- !is.na(hit)
  x[found] <- to[hit[found]]
  x
}
# fix country names
pela$country <- swap_exact(
  pela$country,
  from = c("Brasil", "Panamá", "México", "Perú", "República Dominicana"),
  to = c("Brazil", "Panama", "Mexico", "Peru", "Dominican Republic")
)
# fix messed up formatting of legislative sessions
pela$leg.session <- swap_exact(
  pela$leg.session,
  from = c("06-11", "07-10", "09-11", "09-14", "10-14", "13-17", "913",
           "1217", "1318", "2011"),
  to = c("2006-2011", "2007-2010", "2009-2011", "2009-2014", "2010-2014",
         "2013-2017", "2009-2013", "2012-2017", "2013-2018", "2011-2016")
)
# miscoded somehow: these three surveys get their session set by file name
survey_hit <- match(pela$survey, c("./PELA/da75bra.sav", "./PELA/da76uru.sav",
                                   "./PELA/da85gua.sav"))
survey_found <- !is.na(survey_hit)
pela$leg.session[survey_found] <-
  c("2006-2010", "2010-2015", "2012-2016")[survey_hit[survey_found]]
# clean sex
pela$sex[pela$sex %in% c("1", "Hombre", "Homem")] <- "Male"
pela$sex[pela$sex %in% c("2", "Mujer", "Mulher")] <- "Female"
# create year indicators: first and last year of the "YYYY-YYYY" session
pela$leg.begin <- as.numeric(str_sub(pela$leg.session, 0, 4))
pela$leg.end <- as.numeric(str_sub(pela$leg.session, 6, 9))
rm(swap_exact, survey_hit, survey_found)
# clean -- make the loops robust to different variable orderings
tens <- which(colnames(pela) %in% c("ideology", "samesex"))
sevens <- which(colnames(pela) %in% c("privatization", "wellbeing", "jobs",
                                      "inequality", "health"))
# lookup tables: labelled end points and "(k)" mid points map to their digit,
# non-response codes map to NA; anything not listed passes through to
# as.numeric() unchanged
ten_from <- c("Esquerda", "Izquierda (1)", "Desaprueba firmemente (1)",
              "Desaprova firmemente",
              "Derecha (10)", "Direita", "Aprueba firmemente (10)",
              "Aprova firmemente",
              paste0("(", 2:9, ")"),
              "N.C.", "N.R.", "N.S.", "99")
ten_to <- c(rep("1", 4), rep("10", 4), as.character(2:9), rep(NA, 4))
seven_from <- c("Discorda muito", "Muy en desacuerdo (1)",
                "Muy de acuerdo (7)", "Concorda muito",
                paste0("(", 2:6, ")"),
                "N.C.", "N.S.", "N.R.", "99",
                "9") # not sure so have to NA it
seven_to <- c("1", "1", "7", "7", as.character(2:6), rep(NA, 5))
# other variables
for(i in tens){
  answers <- pela[,i]
  hit <- match(answers, ten_from)
  answers[!is.na(hit)] <- ten_to[hit[!is.na(hit)]]
  pela[,i] <- as.numeric(answers)
}
for(i in sevens){
  answers <- pela[,i]
  hit <- match(answers, seven_from)
  answers[!is.na(hit)] <- seven_to[hit[!is.na(hit)]]
  pela[,i] <- as.numeric(answers)
}
rm(ten_from, ten_to, seven_from, seven_to, answers, hit)
# expand pela into time-series: every respondent row is repeated once for each
# year from leg.begin to leg.end, keeping only the analysis columns.
# The per-respondent pieces are collected in a list and bound in a single
# rbind() call (growing a data frame inside the loop copies it on every
# iteration), and seq_len() keeps the loop from running when pela has no rows.
pela.cols <- c("country","year","survey","ideology","samesex",
               "privatization","wellbeing","jobs","inequality",
               "health","party","sex")
pela.full <- data.frame(matrix(nrow=0,ncol=length(pela.cols)))
colnames(pela.full) <- pela.cols
pela.pieces <- lapply(seq_len(nrow(pela)), function(i){
  tmp <- pela[i,]
  years <- tmp$leg.begin:tmp$leg.end
  tmp <- tmp[rep(1L, length(years)),]
  tmp$year <- years
  tmp[,pela.cols]
})
# the empty template goes first so an empty input still yields the 12 columns
pela.full <- do.call(rbind, c(list(pela.full), pela.pieces))
pela <- pela.full
# now invert the econ index stuff since 1 is conservative and 7 is liberal here
# (a 1-7 answer x becomes 8 - x)
pela.econ <- c("privatization", "wellbeing", "jobs", "inequality", "health")
for(v in pela.econ){
  pela[[v]] <- 8 - pela[[v]]
}
# now invert the same-sex marriage question so 1 is liberal and 10 is
# conservative (a 1-10 answer x becomes 11 - x)
pela$samesex <- 11 - pela$samesex
# fix survey: keep only the first two-digit study number found in the label
pela$survey <- paste("PELA study", str_extract(pela$survey, "\\d{2}"))
# create scaled variables: map each item from its own range onto [-1, 1]
for(v in c("ideology", "samesex")){
  pela[[paste0(v, "_scaled")]] <- rescalr(pela[[v]], 1, 10, -1, 1)
}
for(v in pela.econ){
  pela[[paste0(v, "_scaled")]] <- rescalr(pela[[v]], 1, 7, -1, 1)
}
# fix country name: each pattern is matched as a regular expression, so
# accented spellings are caught without typing the accented character
country.fixes <- c("xico" = "Mexico", "Panam" = "Panama", "^Per" = "Peru",
                   "Dominicana" = "Dominican Republic")
for(pattern in names(country.fixes)){
  pela$country[grepl(pattern, pela$country)] <- country.fixes[[pattern]]
}
# clean up: sort by country-year, then fix the column order
pela <- pela[order(pela$country, pela$year),]
pela <- pela[,c("country","year","survey","ideology","samesex",
                "privatization","wellbeing","jobs","inequality",
                "health","ideology_scaled","samesex_scaled",
                "privatization_scaled","wellbeing_scaled","jobs_scaled",
                "inequality_scaled", "health_scaled","party","sex")]

### remove duplicate sampling
# Drop the listed study/year combinations. A logical mask is used instead of
# pela[-which(cond),]: when which() finds no match it returns integer(0), and
# pela[-integer(0),] selects zero rows, which would silently empty the data.
# %in% TRUE treats an NA comparison as "not a duplicate", as which() did.
dup.sample <- (pela$survey == "PELA study 74" & pela$year == "2014") |
  (pela$survey == "PELA study 78" & pela$year == "2014") |
  (pela$survey == "PELA study 83" & pela$year == "2014") |
  (pela$survey == "PELA study 80" & pela$year == "2011")
pela <- pela[!(dup.sample %in% TRUE),]

### fix party names
# Helper: within one PELA study, relabel every respondent whose party equals
# `from` (or matches it as a regular expression when regex = TRUE) as `to`.
# Rows with a missing survey or party are left untouched. Rules are applied in
# the order written, so an earlier relabel can affect a later regex match.
recode_party <- function(d, study, from, to, regex = FALSE){
  in.study <- d$survey == paste("PELA study", study)
  is.match <- if(regex) grepl(from, d$party) else d$party == from
  d$party[which(in.study & is.match)] <- to
  d
}
# Argentina 2010 and 2012
pela <- recode_party(pela, 73, "CC", "CC ARI")
pela <- recode_party(pela, 73, "ARI", "CC ARI")
# UCR is folded into CC ARI for the 2010 rows only
pela$party[which(pela$survey == "PELA study 73" & pela$year == "2010" &
                   pela$party == "UCR")] <- "CC ARI"
pela <- recode_party(pela, 73, "Frente para la Victoria-PJ", "FPV PJ")
# the next pattern match is run under a Spanish UTF-8 locale
Sys.setlocale("LC_ALL", "es_ES.UTF-8")
pela <- recode_party(pela, 73, "por Santiago", "Other", regex = TRUE)
# put system back in native locale
# NOTE(review): this sets a fixed per-OS locale rather than restoring whatever
# was active before the switch -- confirm that is intended
os.name <- unname(Sys.info()["sysname"])
if(os.name == "Linux"){
  Sys.setlocale("LC_ALL", "en_GB.UTF-8")
} else if(os.name == "Darwin"){
  Sys.setlocale("LC_ALL", "en_US") 
} else if(os.name == "Windows"){ # untested
  Sys.setlocale(category = "LC_ALL", locale = "English_United States.1252")
}
pela <- recode_party(pela, 73, "GEN", "Other")
pela <- recode_party(pela, 73, "Movimiento Proyecto Sur", "PRO")
pela <- recode_party(pela, 73, "Nuevo Encuentro Popular y Solidario", "FPV PJ")
pela <- recode_party(pela, 73, "Partido Socialista", "CC ARI")
pela <- recode_party(pela, 73, "Peronismo Federal", "PRO")
pela <- recode_party(pela, 73, "Peronista", "FPV PJ")
pela <- recode_party(pela, 73, "13", "Other")
# Bolivia 2010, 2012, 2014
pela <- recode_party(pela, 81, "Otros partidos", "Other")
# Brazil 2010
pela <- recode_party(pela, 75, "PR", "PL")
# PSC is selected by country and year rather than by study number
pela$party[which(pela$country == "Brazil" & pela$year == "2010" &
                   pela$party == "PSC")] <- "Other"
for(small.party in c("PTC", "PRB", "PMN", "PHS")){
  pela <- recode_party(pela, 75, small.party, "Other")
}
# Chile 2010, 2012, 2014
pela <- recode_party(pela, 77, "Otros partidos", "Other")
pela <- recode_party(pela, 77, "Independiente", "Other")
# Colombia 2014
pela <- recode_party(pela, 95, "Ciudad", "OC", regex = TRUE)
pela <- recode_party(pela, 95, "Alianza", "AdlV", regex = TRUE)
# Costa Rica 2010, 2012
pela <- recode_party(pela, 78, "Libert", "ML", regex = TRUE)
# Costa Rica 2014
pela <- recode_party(pela, 93, "Accesibilidad", "PASE", regex = TRUE)
pela <- recode_party(pela, 93, "Movimiento", "ML", regex = TRUE)
pela <- recode_party(pela, 93, "Unidad", "PUSC", regex = TRUE)
pela <- recode_party(pela, 93, "Costarri", "RC", regex = TRUE)
pela <- recode_party(pela, 93, "Restaurac", "RN", regex = TRUE)
pela <- recode_party(pela, 93, "Liberac", "PLN", regex = TRUE)
pela <- recode_party(pela, 93, "Alianza", "ADC", regex = TRUE)
pela <- recode_party(pela, 93, "Frente", "FA", regex = TRUE)
# DR 2010, 2012, 2014
pela <- recode_party(pela, 82, "PRSC", "Other")
# Ecuador 2013, 2014
pela <- recode_party(pela, 90, "Carchi", "Independent", regex = TRUE)
pela <- recode_party(pela, 90, "Region", "Independent", regex = TRUE)
pela <- recode_party(pela, 90, "izquier", "PUL", regex = TRUE)
pela <- recode_party(pela, 90, "Pachakutik", "PUL", regex = TRUE)
pela <- recode_party(pela, 90, "Alianza", "PAIS", regex = TRUE)
# El Salvador 2012, 2014
pela <- recode_party(pela, 88, "PES", "Other")
pela <- recode_party(pela, 88, "CD", "Other")
# Guatemala 2010
pela <- recode_party(pela, 68, "BG", "Otros partidos")
# Guatemala 2012, 2014
pela <- recode_party(pela, 85, "UNE", "UNE-GANA")
pela <- recode_party(pela, 85, "GANA", "UNE-GANA")
# Honduras 2014
pela <- recode_party(pela, 92, "PINU", "Other")
# Panama 2014
pela <- recode_party(pela, 94, "CAMBIO", "CD", regex = TRUE)
pela <- recode_party(pela, 94, "PANAM", "Pista", regex = TRUE)
pela <- recode_party(pela, 94, "INDE", "Other", regex = TRUE)
# Paraguay 2014
pela <- recode_party(pela, 91, "AVANZA", "Other", regex = TRUE)
pela <- recode_party(pela, 91, "FRENTE", "FG", regex = TRUE)
pela <- recode_party(pela, 91, "PEN", "Other")
pela <- recode_party(pela, 91, "PPQ", "Other")
# Peru 2010 - no idea what PNP is, doesn't fit PELA documentation
pela <- recode_party(pela, 80, "PNP", NA)
# Peru 2011
pela <- recode_party(pela, 84, "Posible", "PP", regex = TRUE)
pela <- recode_party(pela, 84, "Gana", "Gana", regex = TRUE)
# Uruguay 2010
pela <- recode_party(pela, 76, "Partido Independiente", "PI")

### restrict to years for which we have lapop data
pela <- pela[pela$year %in% c("2010","2012","2014"),]

# checkpoint - save pela
write.csv(pela,"./final/pela-full.csv", row.names = FALSE)

### Now we need to do the weighted EMDs
# party weights; the weighting code reads its country, year, party and weight
# columns
p <- read.csv("./partyweights/partyweights.csv", stringsAsFactors = FALSE)

# first calculate ideology weights
# Give every respondent-year row a unique key ("e1", "e2", ...) so the weights
# can be merged back on. seq_len() is used because seq(1, nrow, 1) errors on an
# empty frame and returns doubles, which as.character() would print in
# scientific notation for round values (row 100000 -> "e1e+05"). paste0() has
# no `sep` argument, so the former sep="" only appended an empty string.
pela$id <- paste0("e", seq_len(nrow(pela)))
### Compute respondent weights for one PELA item, country-year by country-year.
### The ideology and same-sex marriage passes differ only in the item name, so
### both go through this one helper.
###   pela     : respondent rows; uses id, country, year, party, sex, <item>
###              and <item>_scaled
###   df.final : country-year panel; uses country, year, pc_women_elite_real
###   p        : party weights; uses country, year, party, weight
###   item     : "ideology" or "samesex"
### Returns a data frame with columns id, <item>_weight, <item>_weight_d.
### Within each country-year the weights sum to 1. <item>_weight_d repeats the
### weight only where both sex and party were available, and is NA otherwise.
### Stops with an error if a country-year cannot be weighted to sum to 1.
weight.pela.item <- function(pela, df.final, p, item){
  scaled <- paste0(item, "_scaled")
  elist <- vector("list", nrow(df.final))
  for(i in seq_along(elist)){
    # respondents belonging to the i-th country-year of the panel
    tmp <- pela[which(pela$country == df.final$country[i] &
                        pela$year == df.final$year[i]),]
    answered <- !is.na(tmp[[item]])
    
    ### if no data/wrong country then skip it
    if(!any(answered)){
      elist[[i]] <- tmp[answered,] # i.e., a zero-row frame
      next
    }
    
    no.party <- all(is.na(tmp$party))
    no.sex <- all(is.na(tmp$sex))
    
    ### if no party or sex info, just weight everything evenly
    if(no.party && no.sex){
      tmp <- tmp[!is.na(tmp[[scaled]]),]
      tmp$w <- rep(1/nrow(tmp), nrow(tmp))
      stopifnot(isTRUE(all.equal(sum(tmp$w), 1)))
      tmp$w_d <- rep(NA, nrow(tmp))
      elist[[i]] <- tmp
      next
    }
    
    ### if just no party info, weight on sex only
    if(no.party){
      tmp <- tmp[which(!is.na(tmp[[scaled]]) & !is.na(tmp$sex)),]
      tmp$w <- rep(1/nrow(tmp), nrow(tmp))
      ww <- df.final$pc_women_elite_real[i]
      mw <- 1 - ww
      men <- tmp[which(tmp$sex == "Male"),]
      women <- tmp[which(tmp$sex == "Female"),]
      # rescale so men sum to mw and women sum to ww
      men$w <- (mw/sum(men$w))*men$w
      women$w <- (ww/sum(women$w))*women$w
      tmp <- rbind(women, men)
      stopifnot(isTRUE(all.equal(sum(tmp$w), 1)))
      tmp$w_d <- rep(NA, nrow(tmp))
      elist[[i]] <- tmp
      next
    }
    
    ### if no sex info, weight on party only
    if(no.sex){
      tmp <- tmp[which(!is.na(tmp[[scaled]]) & !is.na(tmp$party)),]
      tmp$w <- rep(1/nrow(tmp), nrow(tmp))
      for(party in sort(unique(tmp$party))){
        pw <- p$weight[which(p$country == df.final$country[i] &
                               p$year == df.final$year[i] &
                               p$party == party)]
        # a party absent from p gives a zero-length pw, which !is.na() alone
        # would let through
        stopifnot(length(pw) > 0, !is.na(pw))
        members <- which(tmp$party == party)
        tmp$w[members] <- (pw/sum(tmp$w[members]))*tmp$w[members]
      }
      stopifnot(isTRUE(all.equal(sum(tmp$w), 1)))
      tmp$w_d <- rep(NA, nrow(tmp))
      elist[[i]] <- tmp
      next
    }
    
    ### if both, weight on both
    tmp <- tmp[which(!is.na(tmp[[scaled]]) &
                       !is.na(tmp$sex) & !is.na(tmp$party)),]
    tmp$w <- rep(1/nrow(tmp), nrow(tmp))
    
    # get the gender weights
    ww <- df.final$pc_women_elite_real[i]
    mw <- 1 - ww
    # break it into party/sex binaries, then weight and recombine
    # (split() also returns empty sex-by-party cells; those stay zero-row)
    obslist <- split(tmp, f=list(tmp$sex, tmp$party))
    for(j in seq_along(obslist)){
      pw <- p$weight[which(p$country == df.final$country[i] &
                             p$year == df.final$year[i] &
                             p$party == unique(obslist[[j]]$party))]
      sw <- ifelse(unique(obslist[[j]]$sex) == "Male", mw, ww)
      W <- pw*sw
      obslist[[j]]$w <- (W/sum(obslist[[j]]$w))*obslist[[j]]$w
    }
    tmp <- do.call(rbind, obslist)
    stopifnot(abs(1 - sum(tmp$w)) < .1)
    # sometimes parties only have 1 gender, in which case this is slightly off
    # not much we can do, but the size of these errors are small.
    # reweight in such cases -- always women need to be upweighted
    # (all.equal() returns a message string, not FALSE, on a mismatch, so it
    # must be wrapped in isTRUE() rather than compared to TRUE)
    if(!isTRUE(all.equal(sum(tmp$w), 1))){
      shortfall <- 1 - sum(tmp$w)
      women <- which(tmp$sex == "Female")
      tmp$w[women] <- tmp$w[women] + shortfall/length(women)
      tmp$w <- tmp$w*(1/sum(tmp$w))
      stopifnot(abs(ww - sum(tmp$w[which(tmp$sex == "Female")])) < .05)
    }
    stopifnot(isTRUE(all.equal(sum(tmp$w), 1)))
    tmp$w_d <- tmp$w
    elist[[i]] <- tmp
  }
  out <- do.call(rbind, elist)
  out <- out[,c("id","w","w_d")]
  names(out)[c(2:3)] <- paste0(item, c("_weight","_weight_d"))
  out
}

# ideology weights, merged back onto the respondents by id
pela.weighted <- weight.pela.item(pela, df.final, p, "ideology")
pela <- merge(pela,pela.weighted,by="id",all=TRUE)

# now calculate same-sex marriage weights
pela.weighted <- weight.pela.item(pela, df.final, p, "samesex")
pela <- merge(pela,pela.weighted,by="id",all=TRUE)

# save the cleaned version
write.csv(pela, "./final/final-pela-weighted.csv", row.names = FALSE)

# clean up: remove every object in the workspace except the shared utilities
# and the data frames that later steps still need (keep.objs itself is not in
# the list, so it is removed as well)
keep.objs <- c("completeFun","expand.grid.df",
               "qcut","rescalr","simpleCap","df.final",
               "pela","p")
rm(list = setdiff(ls(), keep.objs))

### Clean LAPOP again
# Also NB: as of 18-Sep-17, the public merge file does not include Brazil 2010
# D6 data for some reason; so we will merge it from the country file directly.
# Both files are read from Stata format, labelled values are turned into their
# labels with as_factor(), and every column is then stored as character so the
# recodes below can compare against label text.
lapop <- as.data.frame(as_factor(
  read_dta("./LAPOP/AmericasBarometer Grand Merge 2004-2014.dta", 
           encoding = "latin1")))
lapop[] <- lapply(lapop, as.character)
brz <- as.data.frame(as_factor(read_dta("./LAPOP/Brazil-2010.dta")))
brz[] <- lapply(brz, as.character)
# keep what we need: the twelve core items plus every r<digit> (material
# wealth) and gi* (knowledge) column, then give the core items readable names
knowledge <- which(grepl("^gi", colnames(lapop)))
numerics <- which(grepl("^r\\d", colnames(lapop)))
lapop <- lapop[,c("pais","year","l1","d6","ros1","ros2","ros3","ros4","ros6",
                  "q10","ed","ocup1",colnames(lapop)[c(numerics,knowledge)])]
names(lapop)[c(1:12)] <- c("country","year","ideology","samesex",
                           "privatization","wellbeing","jobs","inequality",
                           "health","income","education","occupation")
lapop$gi0 <- lapop$gi6 <- lapop$gi7 <- NULL # only factual knowledge questions
# harmonize Brazil country file variables: columns that are added here because
# the selection below needs them are filled with NA
brz$r2 <- brz$r13 <- brz$r17 <- brz$r26 <- brz$r4b <- rep(NA,nrow(brz))
brz$gi2 <- brz$gi4usa <- brz$gi5 <- brz$gi7r <- brz$gix4 <- rep(NA,nrow(brz))
brz$year <- rep("2010", nrow(brz))
brz$country <- rep("Brazil", nrow(brz))
brz <- brz[,c("country", "year", "l1", "d6", "ros1", "ros2", "ros3", "ros4", 
              "ros6", "q10", "ed", "ocup1", "r1", "r2", "r3", "r4", "r4a", 
              "r4b", "r5", "r6", "r7", "r8", "r12", "r13", "r14", "r15", "r16", 
              "r17", "r18", "r20", "r21", "r22", "r23", "r24", "r25", "r26", 
              "gl1", "gi2", "gl3", "gl4", "gi4usa", "gi5", "gi7r", "gix4")]
# names are copied positionally, so the column order above must mirror lapop
names(brz) <- names(lapop)
# the NA filler columns are logical; make everything character again
brz[] <- lapply(brz, as.character)
# delete Brazil from merge and replace it with real Brazil data, inc. d6
# A logical mask is used instead of lapop[-which(cond),]: if no row matched,
# which() would return integer(0) and the negative index would drop every row.
lapop <- lapop[!((lapop$country == "Brasil" & lapop$year == "2010") %in% TRUE),]
lapop <- rbind(lapop,brz)
# look the survey items up by name so the recodes are robust to different
# variable orderings
tens <- which(colnames(lapop) %in% c("ideology", "samesex"))
sevens <- which(colnames(lapop) %in% c("privatization", "wellbeing", "jobs",
                                       "inequality", "health"))
# fix DVs on a 1-10 scale: labelled endpoints become "1"/"10", non-response
# labels become NA, and the column is converted to numeric
lapop[tens] <- lapply(lapop[tens], function(x){
  x[x %in% c("1 Desaprueba firmemente", "Desaprueba firmemente",
             "Desaprova Fortemente", "1 Izquierda", "Izquierda",
             "Esquerda")] <- "1"
  x[x %in% c("10 Aprueba firmemente", "Aprueba firmemente",
             "Apoia Fortemente", "10 Derecha", "Derecha",
             "Direita")] <- "10"
  x[x %in% c("No Aplica", "No Responde", "No Sabe",
             "No se le preguntó en este país o año",
             "N/A", "DK", "NR")] <- NA
  as.numeric(x)
})
# fix DVs on a 1-7 scale: labelled endpoints become "1"/"7", bracketed
# midpoints "(k)" become "k", non-response labels become NA, and the column is
# converted to numeric
lapop[sevens] <- lapply(lapop[sevens], function(x){
  x[x %in% c("Muy en desacuerdo", "Discorda Muito")] <- "1"
  midpoints <- c("(2)" = "2", "(3)" = "3", "(4)" = "4", "(5)" = "5",
                 "(6)" = "6")
  bracketed <- x %in% names(midpoints)
  x[bracketed] <- unname(midpoints[x[bracketed]])
  x[x %in% c("Muy de acuerdo", "Concorda Muito")] <- "7"
  x[x %in% c("No Aplica", "No Responde", "No Sabe",
             "No se le preguntó en este país o año",
             "DK", "NR", "N/A")] <- NA
  as.numeric(x)
})
# now invert the econ questions since 7=agree with leftist position
# (a 1-7 answer x becomes 8 - x)
for(v in c("privatization", "wellbeing", "jobs", "inequality", "health")){
  lapop[[v]] <- 8 - lapop[[v]]
}
# now invert same-sex marriage since 10=approve of leftist position
# (a 1-10 answer x becomes 11 - x)
lapop$samesex <- 11 - lapop$samesex
# fix country names: replace the survey's spellings with the English ones,
# pairing the two vectors below by position; other names are left as they are
survey.names <- c("Belice", "Brasil", "Canadá", "Estados Unidos", "Haití",
                  "México", "Panamá", "Perú", "República Dominicana",
                  "Trinidad y Tobago", "Venezuela")
english.names <- c("Belize", "Brazil", "Canada", "United States", "Haiti",
                   "Mexico", "Panama", "Peru", "Dominican Republic",
                   "Trinidad and Tobago", "Venezuela, RB")
hit <- match(lapop$country, survey.names)
lapop$country[!is.na(hit)] <- english.names[hit[!is.na(hit)]]
rm(v, survey.names, english.names, hit)
# fix occupational data: collapse numeric codes and Spanish/Portuguese labels
# into "worker", "professional" and "other"; non-response becomes NA.
# Portuguese labels appear twice, once typed with accents and once with \x byte
# escapes, so that either encoding of the source text is matched.
lapop$occupation[lapop$occupation %in% c(
  "4", "6", "8", "9", "11", "12", "13", "14",
  "Agricultor, ou produtor agropecu\xe1rio e pesqueiro",
  "Agricultor, ou produtor agropecuário e pesqueiro",
  "Artes\xe3o",
  "Artesano",
  "Artesão",
  "Empregado Agr\xedcola (trabalha nas terras de outros)",
  "Empregado Agrícola (trabalha nas terras de outros)",
  "Peón agrícola (trabaja la tierra para otros)",
  "Servi\xe7o dom\xe9stico",
  "Servicio doméstico",
  "Serviço doméstico",
  "Oficinista (secretaria, operador de maquina de oficina, etc.)",
  "Campesino, agricultor, o productor agropecuario y pesquero",
  "Obrero",
  "Empleado, fuera de oficina, en el sector de servicios",
  "Vendedor demostrador en almacenes y mercados",
  "Vendedor em armazéns ou mercados",
  "Vendedor em armaz\xe9ns ou mercados")] <- "worker"
lapop$occupation[lapop$occupation %in% c(
  "1", "2", "3", "5", "7",
  "Profissional liberal, intelectual e cient\xedfico",
  "Profesional, intelectual y científico",
  "Profissional liberal, intelectual e científico",
  "Trabalhador especializado",
  "Trabajador especializado (operador de maquinaria, albañil, etc.)",
  "T\xe9cnico ou profissional de n\xedvel m\xe9dio",
  "Técnico ou profissional de nível médio",
  "Técnico o profesional de nivel medio (técnico en computación, etc.)",
  "Diretor (gerente, chefe de departamento, supervisor)",
  "Director (gerente, jefe de departamento, supervisor)",
  "Empregados no setor de serviços",
  "Empregados no setor de servi\xe7os")] <- "professional"
lapop$occupation[lapop$occupation %in% c(
  "10", "15",
  "Comerciante",
  "Comerciante (vendedor ambulante, propietario de establecimiento, etc.)",
  "Funcion\xe1rios do governo",
  "Funcionários do governo",
  "Funcionario del gobierno (miembro de los órganos legislativo, etc.)",
  "Membro das For\xe7as Armadas ou pessoal de servi\xe7o de prote\xe7\xe3o e seguran\xe7a",
  "Membro das Forças Armadas ou pessoal de serviço de proteção e segurança",
  "Miembro de las fuerzas armadas o personal de servicio de protección y seguridad",
  "Oper\xe1rio",
  "Operário",
  "Pessoal de apoio")] <- "other"
lapop$occupation[lapop$occupation %in% c(
  "0", "20",
  "No Aplica",
  "No Responde",
  "No Sabe",
  "No se le preguntó en este país o año",
  "77")] <- NA
lapop$occupation <- as.factor(lapop$occupation)
# fix material wealth -- make it binary, then export and factor by country-year
# Each r<digit> item becomes a two-level factor: "1" for yes or any positive
# count (e.g. one, two, three or more cars), "0" for no. Anything else,
# including the non-response labels, ends up NA.
numerics <- which(grepl("r\\d", colnames(lapop)))
lapop[numerics] <- lapply(lapop[numerics], function(x){
  x[x %in% c("Sí", "Sim", "Yes", "Uno", "Dos", "Dois", "Tres o más",
             "Tr\xeas ou mais", "Três ou mais")] <- "1"
  x[x %in% c("No", "N\xe3o", "Não")] <- "0"
  x[x %in% c("No se le preguntó en este país o año", "No Responde",
             "No Sabe", "NR", "No Aplica")] <- NA
  factor(x, levels=c("0","1"))
})
# clean knowledge questions
knowledge <- which(grepl("gi", colnames(lapop)))
# a few special cases: gi3 and gi4usa hold the respondent's answer rather than
# a correct/incorrect label, so score them against the right answer
# ("1" = matches, "0" = any other answer, NA = non-response)
score.answer <- function(x, correct){
  x[x %in% c("No Sabe", "No Responde", "No Aplica",
             "No se le preguntó en este país o año")] <- NA
  as.character(as.integer(x == correct))
}
lapop$gi3 <- score.answer(lapop$gi3, "50")
lapop$gi4usa <- score.answer(lapop$gi4usa, "4")
rm(score.answer)
# all knowledge items: non-response labels to NA, correct/incorrect labels to
# "1"/"0"; values that are already "0"/"1" pass through unchanged
lapop[knowledge] <- lapply(lapop[knowledge], function(x){
  x[x %in% c("No Sabe", "No Responde", "No Aplica",
             "No se le preguntó en este país o año")] <- NA
  x[x %in% c("Correcto", "Correto")] <- "1"
  x[x %in% c("Incorrecto", "Incorrecto/NS", "Incorreto")] <- "0"
  x
})
# fix Brazil education
# Non-substantive answers become NA; every other schooling label is replaced by
# a numeric code stored as a string. Accented labels are listed twice, once
# spelled with byte escapes (e.g. \xe1) and once with the literal character, so
# that either spelling found in the data is matched.
lapop$education[lapop$education %in%
                  c("No Aplica", "No Responde",
                    "No se le preguntó en este país o año",
                    "No Sabe")] <- NA
edu_recode <- list(
  c("Nenhum", "0"),
  c("Ninguno", "0"),
  c("Prim\xe1rio (s\xe9ries) 1", "1"),
  c("Primário (séries) 1", "1"),
  c("Prim\xe1rio (s\xe9ries) 2", "2"),
  c("Primário (séries) 2", "2"),
  c("Prim\xe1rio (s\xe9ries) 3", "3"),
  c("Primário (séries) 3", "3"),
  c("Prim\xe1rio (s\xe9ries) 4", "4"),
  c("Primário (séries) 4", "4"),
  c("Prim\xe1rio (s\xe9ries) 5", "5"),
  c("Primário (séries) 5", "5"),
  c("Prim\xe1rio (s\xe9ries) 6", "6"),
  c("Primário (séries) 6", "6"),
  c("Prim\xe1rio (s\xe9ries) 7", "7"),
  c("Primário (séries) 7", "7"),
  c("Prim\xe1rio (s\xe9ries) 8", "8"),
  c("Primário (séries) 8", "8"),
  c("Secund\xe1rio (s\xe9ries) 1", "9"),
  c("Secundário (séries) 1", "9"),
  c("Secund\xe1rio (s\xe9ries) 2", "10"),
  c("Secundário (séries) 2", "10"),
  c("Secund\xe1rio (s\xe9ries) 3", "11"),
  c("Secundário (séries) 3", "11"),
  # the "1" label carries two spaces before the digit
  c("Universidade (anos)/Ensino Superior n\xe3o-universit\xe1rio (anos)  1",
    "12"),
  c("Universidade (anos)/Ensino Superior não-universitário (anos)  1", "12"),
  c("Universidade (anos)/Ensino Superior n\xe3o-universit\xe1rio (anos) 2",
    "13"),
  c("Universidade (anos)/Ensino Superior não-universitário (anos) 2", "13"),
  c("Universidade (anos)/Ensino Superior n\xe3o-universit\xe1rio (anos) 3",
    "14"),
  c("Universidade (anos)/Ensino Superior não-universitário (anos) 3", "14"),
  c("Universidade (anos)/Ensino Superior n\xe3o-universit\xe1rio (anos) 4",
    "15"),
  c("Universidade (anos)/Ensino Superior não-universitário (anos) 4", "15"),
  # NOTE(review): years 5 and 6 receive the same codes as years 2 and 3 above
  # ("13", "14") -- confirm this is intended.
  c("Universidade (anos) 5", "13"),
  c("Universidade (anos) 6", "14"),
  c("18+", "18")
)
for (pair in edu_recode) {
  lapop$education[which(lapop$education == pair[1])] <- pair[2]
}
# fix Brazil income
# Non-substantive answers become NA; each income bracket label is replaced by
# its bracket number ("0" = no income .. "10" = top bracket), stored as a
# string. Labels containing "até" are listed with both spellings of the accent.
lapop$income[lapop$income %in%
               c("No Aplica", "No Responde", "No Sabe",
                 "No se le preguntó en este país o año")] <- NA
income_recode <- list(
  c("Sem Renda", "0"),
  c("At\xe9 R$ 510,00", "1"),
  c("Até R$ 510,00", "1"),
  c("De R$ 510,01 at\xe9 R$ 1020,00", "2"),
  c("De R$ 510,01 até R$ 1020,00", "2"),
  c("De R$ 1020,01 at\xe9 R$ 1.530,00", "3"),
  c("De R$ 1020,01 até R$ 1.530,00", "3"),
  c("De R$ 1.530,01 at\xe9 R$ 2.550,00", "4"),
  c("De R$ 1.530,01 até R$ 2.550,00", "4"),
  c("De R$ 2.550,01 at\xe9 R$ 3.570,00", "5"),
  c("De R$ 2.550,01 até R$ 3.570,00", "5"),
  c("De R$ 3.570,01 at\xe9 R$ 4.080,00", "6"),
  c("De R$ 3.570,01 até R$ 4.080,00", "6"),
  c("De R$ 4.080,01 at\xe9 R$ 6.120,00", "7"),
  c("De R$ 4.080,01 até R$ 6.120,00", "7"),
  # this label has no space before "até"
  c("De R$ 6.120,01at\xe9 R$ 7.650,00", "8"),
  c("De R$ 6.120,01até R$ 7.650,00", "8"),
  c("De R$ 7.650,01 at\xe9 R$ 10.200,00", "9"),
  c("De R$ 7.650,01 até R$ 10.200,00", "9"),
  c("Mais de R$ 10.200,01", "10")
)
for (pair in income_recode) {
  lapop$income[which(lapop$income == pair[1])] <- pair[2]
}
# factor knowledge: store every recoded knowledge item as a factor
lapop[knowledge] <- lapply(lapop[knowledge], as.factor)
# split into each country-year and keep only the combinations that actually
# contain respondents
templist <- split(lapop, f = list(lapop$country, lapop$year))
templist <- templist[vapply(templist, nrow, integer(1)) > 0]
# factor information
# For every country-year survey in templist, collapse the knowledge items into
# one `knowledge` column:
#   * no usable item      -> NA
#   * exactly one item    -> the raw item, with "1" recoded to "4"
#   * several items       -> quintile label ("0" lowest .. "4" highest) of the
#                            first MCA dimension
# Every branch ends by trimming the survey to the same set of columns.
for(i in 1:length(templist)){
  print(i)
  # get the temporary dataframe; row names serve as respondent IDs
  templist[[i]]$id <- rownames(templist[[i]])
  knowmat <- cbind(templist[[i]]$id,templist[[i]][,knowledge])
  colnames(knowmat)[1] <- "id"
  # remove all NA columns, walking right-to-left so that dropping a column does
  # not shift the columns that are still to be checked
  for(j in ncol(knowmat):2){       # 2 since column 1 is IDs
    if(all(is.na(unique(knowmat[,j]))) == T){
      knowmat <- knowmat[,-j]
    }
  }
  # if no knowledge questions were asked, we skip the country-year: with only
  # the id column left, `knowmat[,-j]` has collapsed to a plain vector, so
  # dim() is NULL
  if(is.null(dim(knowmat)) == T){
    templist[[i]]$knowledge <- rep(NA,nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][,c("country", "year", "ideology",
                                      "samesex", "privatization",
                                      "wellbeing", "jobs", "inequality",
                                      "health", "income", "education",
                                      "occupation", "r1", "r2", "r3", "r4",
                                      "r4a", "r4b", "r5", "r6", "r7", "r8",
                                      "r12", "r13", "r14", "r15", "r16",
                                      "r17", "r18", "r20", "r21", "r22",
                                      "r23", "r24", "r25", "r26", "knowledge")]
    next
  }
  # take only complete cases for the factoring step
  # NOTE(review): the previous comment here said a question is dropped when no
  # complete cases remain, but no such step exists below -- confirm.
  kmat <- knowmat[complete.cases(knowmat),]
  # only one knowledge item survived (id + 1 column): use it directly instead
  # of factoring. "1" is relabelled "4", leaving the values "0" and "4", the
  # same labels the quintile coding below gives its bottom and top groups.
  if(dim(kmat)[2] == 2){
    colnames(knowmat)[2] <- "knowledge"
    stopifnot(nrow(templist[[i]]) == nrow(knowmat))
    knowmat$knowledge <- as.character(knowmat$knowledge)
    knowmat$knowledge[which(knowmat$knowledge == "1")] <- "4"
    templist[[i]]$knowledge <- knowmat$knowledge
    templist[[i]] <- templist[[i]][,c("country", "year", "ideology",
                                      "samesex", "privatization",
                                      "wellbeing", "jobs", "inequality",
                                      "health", "income", "education",
                                      "occupation", "r1", "r2", "r3", "r4",
                                      "r4a", "r4b", "r5", "r6", "r7", "r8",
                                      "r12", "r13", "r14", "r15", "r16",
                                      "r17", "r18", "r20", "r21", "r22",
                                      "r23", "r24", "r25", "r26", "knowledge")]
    next
  }
  # MCA (FactoMineR) on the knowledge items of the complete cases
  facs <- MCA(kmat[,-1], ncp = 5, graph = F)
  # save first factor score
  kmat$knowledge <- facs$ind$coord[,1]
  knowmat <- kmat
  # need to make sure scales are in the same direction -- make numeric,
  # sort by sum of 1's, then flip axes as necessary
  for(j in 2:(ncol(knowmat)-1)){
    knowmat[,j] <- as.numeric(as.character(knowmat[,j]))
  }
  knowmat <- knowmat[order(rowSums(knowmat[,
                                           2:(ncol(knowmat)-1)]),decreasing=T),]
  # after sorting, the first row has the most correct answers and the last row
  # the fewest; if the score runs the other way round, reverse its sign
  if(as.logical(head(knowmat$knowledge, 1) < tail(knowmat$knowledge, 1)) == T){
    knowmat$knowledge <- knowmat$knowledge*(-1)
  }
  # re-merge and store metadata, matching on ID; respondents that were not
  # complete cases come back with NA knowledge
  templist[[i]] <- merge(templist[[i]],knowmat, by = "id", all.x = T, all.y = T)
  templist[[i]] <- templist[[i]][,c("id","country", "year", "ideology",
                                    "samesex", "privatization",
                                    "wellbeing", "jobs", "inequality",
                                    "health", "income", "education",
                                    "occupation", "r1", "r2", "r3", "r4",
                                    "r4a", "r4b", "r5", "r6", "r7", "r8",
                                    "r12", "r13", "r14", "r15", "r16",
                                    "r17", "r18", "r20", "r21", "r22",
                                    "r23", "r24", "r25", "r26", "knowledge")]
  # now we need to quantize
  tmpdf <- templist[[i]]
  tmpdf$knowledge <- jitter(as.numeric(tmpdf$knowledge), factor=10e-8)
  # jitter to allow quantization (it adds a little random noise, which breaks
  # ties between identical scores)
  knowqnt <- as.character(qcut(tmpdf$knowledge,5)) # just to get labels
  # sanity check: the highest-labelled group must have a higher mean score than
  # the lowest-labelled group
  stopifnot(mean(tmpdf$knowledge[which(knowqnt == max(as.numeric(knowqnt),
                                                      na.rm=T))]) >
              mean(tmpdf$knowledge[which(knowqnt == min(as.numeric(knowqnt),
                                                        na.rm=T))]))
  # make sure factor levels align
  tmpdf$knowledge <- knowqnt
  tmpdf <- tmpdf[,c("id","knowledge")]
  # second merge swaps the continuous score (knowledge.x) for the quintile
  # label (knowledge.y), which merge() places in the last column
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id", all.x = T, all.y = T)
  templist[[i]]$knowledge.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "knowledge.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "knowledge"
  templist[[i]] <- templist[[i]][,c("country", "year", "ideology",
                                    "samesex", "privatization",
                                    "wellbeing", "jobs", "inequality",
                                    "health", "income", "education",
                                    "occupation", "r1", "r2", "r3", "r4",
                                    "r4a", "r4b", "r5", "r6", "r7", "r8",
                                    "r12", "r13", "r14", "r15", "r16",
                                    "r17", "r18", "r20", "r21", "r22",
                                    "r23", "r24", "r25", "r26", "knowledge")]
}
# positions of the columns whose name contains "r" followed by a digit (the
# r1 .. r26 items), taken from the first survey and reused for every survey
numerics <- which(grepl("r\\d", colnames(templist[[1]])))
# factor material wealth
# For every country-year survey, replace the r* items by one `wealth` column:
# the quintile label ("0" lowest .. "4" highest) of the first MCA dimension,
# or NA when the survey has no usable r* item.
for(i in 1:length(templist)){
  print(i)
  # get the temporary dataframe; row names serve as respondent IDs
  templist[[i]]$id <- rownames(templist[[i]])
  matgoods <- cbind(templist[[i]]$id,templist[[i]][,numerics])
  colnames(matgoods)[1] <- "id"
  # remove all NA columns, walking right-to-left so that dropping a column does
  # not shift the columns that are still to be checked
  for(j in ncol(matgoods):2){                          # 2 since column 1 is IDs
    if(all(is.na(unique(matgoods[,j]))) == T){
      matgoods <- matgoods[,-j]
    }
  }
  # if no durable goods questions were asked, we skip the country-year: with
  # only the id column left, `matgoods[,-j]` has collapsed to a plain vector,
  # so dim() is NULL
  if(is.null(dim(matgoods)) == T){
    templist[[i]]$wealth <- rep(NA,nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][,c("country", "year", "ideology",
                                      "samesex", "privatization",
                                      "wellbeing", "jobs", "inequality",
                                      "health", "income", "education",
                                      "occupation", "wealth", "knowledge")]
    next
  }
  # take only complete cases and factor
  # NOTE(review): unlike the knowledge step there is no special case for a
  # survey with a single usable item -- confirm MCA copes with that input.
  matgoods <- matgoods[complete.cases(matgoods),]
  facs <- MCA(matgoods[,-1], ncp = 5, graph = F)
  # save first factor score
  matgoods$wealth <- facs$ind$coord[,1]
  # need to make sure scales are in the same direction -- make numeric,
  # sort by sum of 1's, then flip axes as necessary
  for(j in 2:(ncol(matgoods)-1)){
    matgoods[,j] <- as.numeric(as.character(matgoods[,j]))
  }
  matgoods <- matgoods[order(rowSums(matgoods[,2:(ncol(matgoods)-1)]),
                             decreasing=T),]
  # after sorting, the first row has the largest item sum and the last row the
  # smallest; if the score runs the other way round, reverse its sign
  if(as.logical(head(matgoods$wealth, 1) < tail(matgoods$wealth, 1)) == T){
    matgoods$wealth <- matgoods$wealth*(-1)
  }
  # re-merge and store metadata, matching on ID; respondents that were not
  # complete cases come back with NA wealth
  templist[[i]] <- merge(templist[[i]],matgoods, by = "id", all.x = T,
                         all.y = T)
  templist[[i]] <- templist[[i]][,c("country", "year", "ideology",
                                    "samesex", "privatization",
                                    "wellbeing", "jobs", "inequality",
                                    "health", "income", "education",
                                    "occupation", "wealth", "knowledge","id")]
  # now we need to quantize
  tmpdf <- templist[[i]]
  tmpdf$wealth <- jitter(as.numeric(tmpdf$wealth), factor=10e-8)
  # jitter to allow quantization (it adds a little random noise, which breaks
  # ties between identical scores)
  matqnt <- as.character(qcut(tmpdf$wealth,5)) # just to get labels
  # sanity check: the highest-labelled group must have a higher mean score than
  # the lowest-labelled group
  stopifnot(mean(tmpdf$wealth[which(matqnt == max(as.numeric(matqnt),
                                                  na.rm=T))]) >
              mean(tmpdf$wealth[which(matqnt == min(as.numeric(matqnt),
                                                    na.rm=T))]))
  # make sure factor levels align
  tmpdf$wealth <- matqnt
  tmpdf <- tmpdf[,c("id","wealth")]
  # second merge swaps the continuous score (wealth.x) for the quintile label
  # (wealth.y), which merge() places in the last column
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id", all.x = T, all.y = T)
  templist[[i]]$wealth.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "wealth.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "wealth"
  templist[[i]] <- templist[[i]][,c("country", "year", "ideology",
                                    "samesex", "privatization",
                                    "wellbeing", "jobs", "inequality",
                                    "health", "income", "education",
                                    "occupation", "wealth", "knowledge")]
}
# quantize education
# NOTE(review): `templist` was split from `lapop` before this point, so this
# conversion of `lapop$education` does not touch the per-survey frames that the
# loop below works on -- confirm it is still wanted.
lapop$education <- as.numeric(lapop$education)
for (i in seq_along(templist)) {
  print(i)
  # assign IDs; they key the merge below
  templist[[i]]$id <- rownames(templist[[i]])
  # surveys without any education data are left as they are
  if (all(is.na(templist[[i]]$education))) {
    next
  }
  # give everyone an education quantile. break ties with random assignment
  # through jitter. jitter shouldn't matter since it will never separate top
  # and bottom quintiles, which is what we care about
  edu_tmp <- templist[[i]][, c("id", "education")]
  edu_raw <- jitter(as.numeric(edu_tmp$education), factor = 10e-8)
  edu_qnt <- as.character(qcut(edu_raw, 5)) # just to get labels
  # make sure factor levels align: the highest-labelled group must have a
  # higher mean than the lowest-labelled group
  edu_hi <- max(as.numeric(edu_qnt), na.rm = TRUE)
  edu_lo <- min(as.numeric(edu_qnt), na.rm = TRUE)
  stopifnot(mean(edu_raw[which(edu_qnt == edu_hi)]) >
              mean(edu_raw[which(edu_qnt == edu_lo)]))
  # swap the raw column (education.x) for the quintile label (education.y),
  # which merge() places in the last column
  edu_tmp$education <- edu_qnt
  templist[[i]] <- merge(templist[[i]], edu_tmp, by = "id",
                         all.x = TRUE, all.y = TRUE)
  templist[[i]]$education.x <- NULL
  last_col <- ncol(templist[[i]])
  stopifnot(colnames(templist[[i]])[last_col] == "education.y")
  colnames(templist[[i]])[last_col] <- "education"
}
# quantize income
# NOTE(review): `templist` was split from `lapop` before this point, so neither
# the numeric conversion nor the "> 10" filter on `lapop$income` reaches the
# per-survey frames quantized below -- income codes above 10 still enter the
# quantiles. Confirm whether the filter was meant to apply to `templist`.
lapop$income <- as.numeric(lapop$income)
lapop$income[which(lapop$income > 10)] <- NA # no idea where these came from
for (i in seq_along(templist)) {
  print(i)
  # surveys without any income data are left as they are
  if (all(is.na(templist[[i]]$income))) {
    next
  }
  # give everyone an income quantile. break ties with random assignment
  # through jitter. the `id` column used here is the one assigned in the
  # education step.
  inc_tmp <- templist[[i]][, c("id", "income")]
  inc_raw <- jitter(as.numeric(inc_tmp$income), factor = 10e-8)
  inc_qnt <- as.character(qcut(inc_raw, 5)) # just to get labels
  # make sure factor levels align: the highest-labelled group must have a
  # higher mean than the lowest-labelled group
  inc_hi <- max(as.numeric(inc_qnt), na.rm = TRUE)
  inc_lo <- min(as.numeric(inc_qnt), na.rm = TRUE)
  stopifnot(mean(inc_raw[which(inc_qnt == inc_hi)]) >
              mean(inc_raw[which(inc_qnt == inc_lo)]))
  # swap the raw column (income.x) for the quintile label (income.y), which
  # merge() places in the last column
  inc_tmp$income <- inc_qnt
  templist[[i]] <- merge(templist[[i]], inc_tmp, by = "id",
                         all.x = TRUE, all.y = TRUE)
  templist[[i]]$income.x <- NULL
  last_col <- ncol(templist[[i]])
  stopifnot(colnames(templist[[i]])[last_col] == "income.y")
  colnames(templist[[i]])[last_col] <- "income"
}
# add weights: every respondent of a country-year gets an equal share, 1/n
for (i in seq_along(templist)) {
  print(i)
  n_resp <- nrow(templist[[i]])
  templist[[i]]$w <- rep(1 / n_resp, n_resp)
}
# recombine
lapop <- do.call(rbind, templist)
# drop unnecessary years, with no non-ideology questions (note no odd years)
lapop <- lapop[lapop$year %in% c("2010", "2012", "2014"), ]
# scale everything to [-1, 1]; each item runs from 1 up to the value below
scale_tops <- c(ideology = 10, samesex = 10, privatization = 7,
                wellbeing = 7, jobs = 7, inequality = 7, health = 7)
for (item in names(scale_tops)) {
  lapop[[paste0(item, "_scaled")]] <- rescalr(lapop[[item]], 1,
                                              scale_tops[[item]], -1, 1)
}
# sort by country-year and keep the final set of columns
final_cols <- c("country", "year", "ideology", "samesex", "privatization",
                "wellbeing", "jobs", "inequality", "health", "wealth",
                "occupation", "education", "income", "knowledge",
                "ideology_scaled", "samesex_scaled", "privatization_scaled",
                "wellbeing_scaled", "jobs_scaled", "inequality_scaled",
                "health_scaled", "w")
lapop <- lapop[order(lapop$country, lapop$year), final_cols]

# checkpoint - save lapop
write.csv(lapop, "./final/final-lapop-full.csv", row.names = FALSE)

# clean up: remove everything from the workspace except the helpers and the
# data objects still needed below
rm(list = setdiff(ls(), c("completeFun", "expand.grid.df",
                          "qcut", "rescalr", "simpleCap", "df.final",
                          "pela", "lapop", "p")))

#### now we need to factor economic indexes together
# For every country-year row of df.final, pool the mass (lapop) and elite
# (pela) respondents, run a PCA (FactoMineR) on the economic items and store
# the first component as `econ`, plus a copy rescaled to [-1, 1] within the
# country-year as `econ_scaled`, back on both data sets.
# store IDs: lapop rows get "m1", "m2", ...; pela is expected to carry its own
# `id` column already. seq_len() yields integers, so large row numbers are
# never rendered in scientific notation, and paste0() takes no `sep` argument.
lapop$id <- paste0("m", seq_len(nrow(lapop)))
lapop$econ <- lapop$econ_scaled <- rep(NA, nrow(lapop))
pela$econ <- pela$econ_scaled <- rep(NA, nrow(pela))
econ_cols <- c("country", "year", "id", "privatization", "wellbeing",
               "jobs", "health")
for (i in seq_len(nrow(df.final))) {
  print(i)
  # get the mass and elite data
  tmp <- rbind(lapop[which(lapop$country == df.final$country[i] &
                             lapop$year == df.final$year[i]), econ_cols],
               pela[which(pela$country == df.final$country[i] &
                            pela$year == df.final$year[i]), econ_cols])
  if (nrow(tmp) == 0) {
    next
  }

  # remove items that are missing for more than 80% of the pooled respondents
  # (right-to-left, so dropping a column does not shift the ones still to be
  # checked; columns 1-3 are country, year and id)
  for (j in ncol(tmp):4) {
    if (sum(is.na(tmp[, j])) / nrow(tmp) > .8) {
      tmp <- tmp[, -j]
    }
  }
  # sometimes this removes all items, so go to next
  if (ncol(tmp) == 3) {
    next
  }
  # take only complete cases and factor
  econmat <- tmp[complete.cases(tmp), ]
  # PCA
  facs <- PCA(econmat[, -c(1:3)], ncp = 5, graph = FALSE)
  # save first factor score
  econmat$econ <- facs$ind$coord[, 1]
  # need to make sure scales are in the same direction -- make numeric,
  # sort by sum, then flip axes as necessary
  for (j in 4:(ncol(econmat) - 1)) {
    econmat[, j] <- as.numeric(as.character(econmat[, j]))
  }
  if (ncol(econmat) > 5) {
    econmat <- econmat[order(rowSums(econmat[, 4:(ncol(econmat) - 1)]),
                             decreasing = TRUE), ]
  } else {
    # a single item is left, so sort on it directly
    econmat <- econmat[order(econmat[, 4], decreasing = TRUE), ]
  }
  # the first row now has the largest item sum and the last row the smallest;
  # if the score runs the other way round, reverse its sign
  if (head(econmat$econ, 1) < tail(econmat$econ, 1)) {
    econmat$econ <- econmat$econ * (-1)
  }
  tmp <- econmat[, c("id", "econ")]
  tmp$econ_scaled <- rescalr(tmp$econ,
                             min(tmp$econ, na.rm = TRUE),
                             max(tmp$econ, na.rm = TRUE),
                             -1, 1)
  # re-merge and store metadata, matching on ID. One vectorised lookup per
  # data set replaces a row-by-row scan of both data sets. Keeping the last
  # row per id reproduces what a row-by-row overwrite would leave behind.
  tmp <- tmp[!duplicated(tmp$id, fromLast = TRUE), ]
  pela_pos <- match(pela$id, tmp$id)
  pela_hit <- which(!is.na(pela_pos))
  pela$econ[pela_hit] <- tmp$econ[pela_pos[pela_hit]]
  pela$econ_scaled[pela_hit] <- tmp$econ_scaled[pela_pos[pela_hit]]
  lapop_pos <- match(lapop$id, tmp$id)
  lapop_hit <- which(!is.na(lapop_pos))
  lapop$econ[lapop_hit] <- tmp$econ[lapop_pos[lapop_hit]]
  lapop$econ_scaled[lapop_hit] <- tmp$econ_scaled[lapop_pos[lapop_hit]]
}
# delete IDs
lapop$id <- NULL

# calculate econ weights
# For every country-year row of df.final, weight the elite (pela) respondents
# that have an econ score so that their weights `w` sum to 1. Depending on what
# the survey records, the weights are uniform, post-stratified on sex (target
# share of women: df.final$pc_women_elite_real), on party (target share:
# p$weight), or on both. `w_d` carries the weight only when both party and sex
# were used and is NA otherwise.
# NOTE(review): pc_women_elite_real is used as a proportion (1 - ww is the
# men's share) -- confirm it is stored on a 0-1 scale.
elist <- vector("list", nrow(df.final))
for (i in seq_along(elist)) {
  elist[[i]] <- pela[which(pela$country == df.final$country[i] &
                             pela$year == df.final$year[i]), ]
}
for (i in seq_along(elist)) {
  ### work in a temporary environment
  tmp <- elist[[i]]

  ### if no data/wrong country then skip it (store the empty frame)
  if (nrow(tmp[which(!(is.na(tmp$econ))), ]) == 0) {
    elist[[i]] <- tmp[which(!(is.na(tmp$econ))), ]
    rm(tmp)
    next
  }

  ### if no party or sex info, just weight everything evenly
  if (all(is.na(unique(tmp$party))) && all(is.na(unique(tmp$sex)))) {
    tmp <- tmp[!is.na(tmp$econ_scaled), ]
    tmp$w <- rep(1 / nrow(tmp), nrow(tmp))
    # all.equal() returns a description, not FALSE, when values differ, so it
    # is always wrapped in isTRUE()
    stopifnot(isTRUE(all.equal(sum(tmp$w), 1)))
    tmp$w_d <- rep(NA, nrow(tmp))
    elist[[i]] <- tmp
    rm(tmp)
    next
  }

  ### if just no party info, weight on sex only
  if (all(is.na(unique(tmp$party)))) {
    tmp <- tmp[which(!(is.na(tmp$econ_scaled)) & !(is.na(tmp$sex))), ]
    tmp$w <- rep(1 / nrow(tmp), nrow(tmp))
    ww <- df.final$pc_women_elite_real[i]
    mw <- 1 - ww
    men <- tmp[which(tmp$sex == "Male"), ]
    women <- tmp[which(tmp$sex == "Female"), ]
    # rescale each group so that its weights sum to the group's target share
    men$w <- (mw / sum(men$w)) * men$w
    women$w <- (ww / sum(women$w)) * women$w
    tmp <- rbind(women, men)
    stopifnot(isTRUE(all.equal(sum(tmp$w), 1)))
    tmp$w_d <- rep(NA, nrow(tmp))
    elist[[i]] <- tmp
    rm(ww, mw, men, women, tmp)
    next
  }

  ### if no sex info, weight on party only
  if (all(is.na(unique(tmp$sex)))) {
    tmp <- tmp[which(!(is.na(tmp$econ_scaled)) & !(is.na(tmp$party))), ]
    tmp$w <- rep(1 / nrow(tmp), nrow(tmp))
    ps <- sort(unique(tmp$party))
    for (j in seq_along(ps)) {
      in_party <- which(tmp$party == ps[j])
      pw <- p$weight[which(p$country == df.final$country[i] &
                             p$year == df.final$year[i] &
                             p$party == ps[j])]
      # a party missing from `p` yields a zero-length weight, which a bare
      # !is.na() check lets through; fail here with a clear message instead
      stopifnot(length(pw) > 0, !anyNA(pw))
      tmp$w[in_party] <- (pw / sum(tmp$w[in_party])) * tmp$w[in_party]
    }
    stopifnot(isTRUE(all.equal(sum(tmp$w), 1)))
    tmp$w_d <- rep(NA, nrow(tmp))
    elist[[i]] <- tmp
    rm(pw, ps, in_party, tmp)
    next
  }

  ### if both, weight on both
  tmp <- tmp[which(!(is.na(tmp$econ_scaled)) &
                     !(is.na(tmp$sex)) & !(is.na(tmp$party))), ]
  tmp$w <- rep(1 / nrow(tmp), nrow(tmp))

  # get the gender weights
  ww <- df.final$pc_women_elite_real[i]
  mw <- 1 - ww
  # break it into party/sex binaries, then weight and recombine
  obslist <- split(tmp, f = list(tmp$sex, tmp$party))
  for (j in seq_along(obslist)) {
    pw <- p$weight[which(p$country == df.final$country[i] &
                           p$year == df.final$year[i] &
                           p$party == unique(obslist[[j]]$party))]
    # split() also returns empty sex/party cells; ifelse() (unlike `if`)
    # accepts their zero-length input, and the assignment below is then a
    # no-op for them
    sw <- ifelse(unique(obslist[[j]]$sex) == "Male", mw, ww)
    W <- pw * sw
    obslist[[j]]$w <- (W / sum(obslist[[j]]$w)) * obslist[[j]]$w
    rm(pw, sw, W) # so no accidental duplicates
  }
  tmp <- do.call(rbind, obslist)
  stopifnot(abs(1 - sum(tmp$w)) < .1)
  # sometimes parties only have 1 gender, in which case this is slightly off
  # not much we can do, but the size of these errors are small.
  # reweight in such cases -- always women need to be upweighted
  if (!isTRUE(all.equal(sum(tmp$w), 1))) {
    shortfall <- 1 - sum(tmp$w)
    # spread the missing weight evenly over the women, then renormalise
    tmp$w[which(tmp$sex == "Female")] <-
      (tmp$w[which(tmp$sex == "Female")] +
         shortfall / nrow(tmp[which(tmp$sex == "Female"), ]))
    tmp$w <- tmp$w * (1 / sum(tmp$w))
    stopifnot(abs(ww - sum(tmp$w[which(tmp$sex == "Female")])) < .05)
    rm(shortfall)
  }
  stopifnot(isTRUE(all.equal(sum(tmp$w), 1)))
  tmp$w_d <- tmp$w
  elist[[i]] <- tmp
  rm(mw, ww, tmp, obslist)
}
# stack the per-country-year results and attach the econ weights to pela by
# respondent id
pela.weighted <- do.call(rbind, elist)[, c("id", "w", "w_d")]
colnames(pela.weighted)[2:3] <- c("econ_weight", "econ_weight_d")
pela <- merge(pela, pela.weighted, by = "id", all = TRUE)

# checkpoint pela again: sort by country-year and keep the final columns
pela_cols <- c("country", "year", "survey", "ideology", "samesex", "econ",
               "privatization", "wellbeing", "jobs", "inequality", "health",
               "ideology_scaled", "samesex_scaled", "econ_scaled",
               "privatization_scaled", "wellbeing_scaled", "jobs_scaled",
               "inequality_scaled", "health_scaled", "party", "sex",
               "ideology_weight", "samesex_weight", "econ_weight",
               "ideology_weight_d", "samesex_weight_d", "econ_weight_d")
pela <- pela[order(pela$country, pela$year), pela_cols]

write.csv(pela, "./final/final-pela-weighted.csv", row.names = FALSE)
rm(p, elist, pela.weighted, pela_cols)

### now let's do some EMDs
# set up temp storage: one slot per country-year row of df.final
mlist <- vector("list", nrow(df.final))
elist <- vector("list", nrow(df.final))
# break out the data: slot i holds the mass (lapop) and elite (pela)
# respondents of the i-th country-year
for (i in seq_len(nrow(df.final))) {
  print(i)
  cy_country <- df.final$country[i]
  cy_year <- df.final$year[i]
  mlist[[i]] <- lapop[which(lapop$country == cy_country &
                              lapop$year == cy_year), ]
  elist[[i]] <- pela[which(pela$country == cy_country &
                             pela$year == cy_year), ]
}

### run the loop -- these will skip a lot, but take longer when it actually has
# to calculate the EMD
# Result columns, all initialised to NA: for each issue (samesex, econ) the
# overall EMD, the EMD by affluence group, and by affluence x knowledge group;
# the "_d" set repeats them for the dropped-weights variant.
emd_cols <- c(
  "emd_samesex",
  "emd_lessaffluent_samesex",
  "emd_midaffluent_samesex",
  "emd_moreaffluent_samesex",
  "emd_lessaffluent_loknow_samesex",
  "emd_midaffluent_loknow_samesex",
  "emd_moreaffluent_loknow_samesex",
  "emd_lessaffluent_hiknow_samesex",
  "emd_midaffluent_hiknow_samesex",
  "emd_moreaffluent_hiknow_samesex",
  "emd_econ",
  "emd_lessaffluent_econ",
  "emd_midaffluent_econ",
  "emd_moreaffluent_econ",
  "emd_lessaffluent_loknow_econ",
  "emd_midaffluent_loknow_econ",
  "emd_moreaffluent_loknow_econ",
  "emd_lessaffluent_hiknow_econ",
  "emd_midaffluent_hiknow_econ",
  "emd_moreaffluent_hiknow_econ",
  "emd_samesex_d",
  "emd_lessaffluent_samesex_d",
  "emd_midaffluent_samesex_d",
  "emd_moreaffluent_samesex_d",
  "emd_lessaffluent_loknow_samesex_d",
  "emd_midaffluent_loknow_samesex_d",
  "emd_moreaffluent_loknow_samesex_d",
  "emd_lessaffluent_hiknow_samesex_d",
  "emd_midaffluent_hiknow_samesex_d",
  "emd_moreaffluent_hiknow_samesex_d",
  "emd_econ_d",
  "emd_lessaffluent_econ_d",
  "emd_midaffluent_econ_d",
  "emd_moreaffluent_econ_d",
  "emd_lessaffluent_loknow_econ_d",
  "emd_midaffluent_loknow_econ_d",
  "emd_moreaffluent_loknow_econ_d",
  "emd_lessaffluent_hiknow_econ_d",
  "emd_midaffluent_hiknow_econ_d",
  "emd_moreaffluent_hiknow_econ_d"
)
# a chained `a <- b <- ... <- value` assigns right-to-left, so the columns are
# created in reverse of the order listed above
for (emd_col in rev(emd_cols)) {
  df.final[[emd_col]] <- rep(NA, nrow(df.final))
}
pb <- txtProgressBar(min = 1, max = nrow(df.final), style = 3)
for(i in 1:nrow(df.final)){
  mdata <- mlist[[i]]
  edata <- elist[[i]]
  
  if(nrow(edata[which(!is.na(edata$ideology)),]) == 0 |
     nrow(mdata[which(!is.na(mdata$ideology)),]) == 0){
    next
  } else {
    # create a temporary data frame of class measures, ranked by our preference
    class <- data.frame(cbind(mdata$country, mdata$wealth, mdata$income,
                              mdata$occupation))
    colnames(class) <- c("country","wealth","income","occupation")
    # delete all NAs
    for(j in ncol(class):2){
      if(all(is.na(unique(class[,j]))) == T){
        class <- class[,-j]
      }
    }
    # make sure the loop uses our best class measure
    whichclass <- colnames(class)[2]
    colnames(mdata)[grep(whichclass, names(mdata))] <- "class"
    
    # create temporary data for knowledge
    info <- data.frame(cbind(mdata$country, mdata$knowledge, mdata$education))
    colnames(info) <- c("country","knowledge","education")
    for(j in ncol(info):2){
      if(all(is.na(unique(info[,j]))) == T){
        info <- info[,-j]
      }
    }
    # make sure the loop uses our best knowledge measure
    whichinfo <- colnames(info)[2]
    colnames(mdata)[grep(whichinfo, names(mdata))] <- "know"
    
    ### Baseline congruence
    ### Samesex marriage
    y <- as.matrix(na.omit(edata$samesex_scaled[which(!(is.na(edata$samesex_weight)))]))
    w.y <- edata$samesex_weight[!is.na(edata$samesex_weight)]
    x <- as.matrix(na.omit(mdata$samesex_scaled))
    w.x <- as.matrix(rep(1/nrow(data.frame(na.omit(mdata$samesex_scaled))),
                         nrow(data.frame(na.omit(mdata$samesex_scaled)))))
    df.final$emd_samesex[which(df.final$country == unique(mdata$country) &
                                 df.final$year == unique(mdata$year))] <-
      emdw(x,w.x,y,w.y,max.iter = 100000)
    # dropped weights where no poststratification
    y <- as.matrix(na.omit(edata$samesex_scaled[which(!(is.na(edata$samesex_weight_d)))]))
    w.y <- edata$samesex_weight_d[!is.na(edata$samesex_weight_d)]
    df.final$emd_samesex_d[which(df.final$country == unique(mdata$country) &
                                   df.final$year == unique(mdata$year))] <-
      emdw(x,w.x,y,w.y,max.iter = 100000)
    
    ### Econ
    y <- as.matrix(na.omit(edata$econ_scaled[which(!(is.na(edata$econ_weight)))]))
    w.y <- edata$econ_weight[!is.na(edata$econ_weight)]
    x <- as.matrix(na.omit(mdata$econ_scaled))
    w.x <- as.matrix(rep(1/nrow(data.frame(na.omit(mdata$econ_scaled))),
                         nrow(data.frame(na.omit(mdata$econ_scaled)))))
    df.final$emd_econ[which(df.final$country == unique(mdata$country) &
                              df.final$year == unique(mdata$year))] <-
      emdw(x,w.x,y,w.y,max.iter = 100000)
    # dropped weights where no poststratification
    y <- as.matrix(na.omit(edata$econ_scaled[which(!(is.na(edata$econ_weight_d)))]))
    w.y <- edata$econ_weight_d[!is.na(edata$econ_weight_d)]
    df.final$emd_econ_d[which(df.final$country == unique(mdata$country) &
                                df.final$year == unique(mdata$year))] <-
      emdw(x,w.x,y,w.y,max.iter = 100000)
    
    #### Congruence by class
    ### Samesex marriage
    y <- as.matrix(na.omit(edata$samesex_scaled[which(!(is.na(edata$samesex_weight)))]))
    w.y <- edata$samesex_weight[!is.na(edata$samesex_weight)]
    x.lessaffluent <- as.matrix(na.omit(
      mdata$samesex_scaled[which(mdata$class == "0")]))
    w.x.lessaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$samesex_scaled[which(mdata$class == "0")]))),
      nrow(data.frame(na.omit(
        mdata$samesex_scaled[which(mdata$class == "0")])))))
    x.midaffluent <- as.matrix(na.omit(
      mdata$samesex_scaled[which(mdata$class == "2")]))
    w.x.midaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$samesex_scaled[which(mdata$class == "2")]))),
      nrow(data.frame(na.omit(
        mdata$samesex_scaled[which(mdata$class == "2")])))))
    x.moreaffluent <- as.matrix(na.omit(
      mdata$samesex_scaled[which(mdata$class == "4")]))
    w.x.moreaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$samesex_scaled[which(mdata$class == "4")]))),
      nrow(data.frame(na.omit(
        mdata$samesex_scaled[which(mdata$class == "4")])))))
    df.final$emd_lessaffluent_samesex[which(df.final$country ==
                                              unique(mdata$country) &
                                              df.final$year ==
                                              unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_samesex[which(df.final$country ==
                                             unique(mdata$country) &
                                             df.final$year ==
                                             unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_samesex[which(df.final$country ==
                                              unique(mdata$country) &
                                              df.final$year ==
                                              unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    # dropped weights where no poststratification
    y <- as.matrix(na.omit(edata$samesex_scaled[which(!(is.na(edata$samesex_weight_d)))]))
    w.y <- edata$samesex_weight_d[!is.na(edata$samesex_weight_d)]
    df.final$emd_lessaffluent_samesex_d[which(df.final$country ==
                                                unique(mdata$country) &
                                                df.final$year ==
                                                unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_samesex_d[which(df.final$country ==
                                               unique(mdata$country) &
                                               df.final$year ==
                                               unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_samesex_d[which(df.final$country ==
                                                unique(mdata$country) &
                                                df.final$year ==
                                                unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    
    ### factored econ
    y <- as.matrix(na.omit(edata$econ_scaled[which(!(is.na(edata$econ_weight)))]))
    w.y <- edata$econ_weight[!is.na(edata$econ_weight)]
    x.lessaffluent <- as.matrix(na.omit(
      mdata$econ_scaled[which(mdata$class == "0")]))
    w.x.lessaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$econ_scaled[which(mdata$class == "0")]))),
      nrow(data.frame(na.omit(
        mdata$econ_scaled[which(mdata$class == "0")])))))
    x.midaffluent <- as.matrix(na.omit(
      mdata$econ_scaled[which(mdata$class == "2")]))
    w.x.midaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$econ_scaled[which(mdata$class == "2")]))),
      nrow(data.frame(na.omit(
        mdata$econ_scaled[which(mdata$class == "2")])))))
    x.moreaffluent <- as.matrix(na.omit(
      mdata$econ_scaled[which(mdata$class == "4")]))
    w.x.moreaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$econ_scaled[which(mdata$class == "4")]))),
      nrow(data.frame(na.omit(
        mdata$econ_scaled[which(mdata$class == "4")])))))
    df.final$emd_lessaffluent_econ[which(df.final$country ==
                                           unique(mdata$country) &
                                           df.final$year ==
                                           unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_econ[which(df.final$country ==
                                          unique(mdata$country) &
                                          df.final$year ==
                                          unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_econ[which(df.final$country ==
                                           unique(mdata$country) &
                                           df.final$year ==
                                           unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    # dropped weights where no poststratification
    y <- as.matrix(na.omit(edata$econ_scaled[which(!(is.na(edata$econ_weight_d)))]))
    w.y <- edata$econ_weight_d[!is.na(edata$econ_weight_d)]
    df.final$emd_lessaffluent_econ_d[which(df.final$country ==
                                             unique(mdata$country) &
                                             df.final$year ==
                                             unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_econ_d[which(df.final$country ==
                                            unique(mdata$country) &
                                            df.final$year ==
                                            unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_econ_d[which(df.final$country ==
                                             unique(mdata$country) &
                                             df.final$year ==
                                             unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    
    
    #### Congruence by class, lo information
    ### Samesex marriage
    y <- as.matrix(na.omit(edata$samesex_scaled[which(!(is.na(edata$samesex_weight)))]))
    w.y <- edata$samesex_weight[!is.na(edata$samesex_weight)]
    x.lessaffluent <- as.matrix(na.omit(
      mdata$samesex_scaled[which(mdata$class == "0" & mdata$know == "0")]))
    w.x.lessaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$samesex_scaled[which(mdata$class == "0"& mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$samesex_scaled[which(mdata$class == "0"& mdata$know == "0")])))))
    x.midaffluent <- as.matrix(na.omit(
      mdata$samesex_scaled[which(mdata$class == "2" & mdata$know == "0")]))
    w.x.midaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$samesex_scaled[which(mdata$class == "2"& mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$samesex_scaled[which(mdata$class == "2"& mdata$know == "0")])))))
    x.moreaffluent <- as.matrix(na.omit(
      mdata$samesex_scaled[which(mdata$class == "4"& mdata$know == "0")]))
    w.x.moreaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$samesex_scaled[which(mdata$class == "4"& mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$samesex_scaled[which(mdata$class == "4"& mdata$know == "0")])))))
    df.final$emd_lessaffluent_loknow_samesex[which(df.final$country ==
                                                     unique(mdata$country) &
                                                     df.final$year ==
                                                     unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_loknow_samesex[which(df.final$country ==
                                                    unique(mdata$country) &
                                                    df.final$year ==
                                                    unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_loknow_samesex[which(df.final$country ==
                                                     unique(mdata$country) &
                                                     df.final$year ==
                                                     unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    # dropped weights where no poststratification
    y <- as.matrix(na.omit(edata$samesex_scaled[which(!(is.na(edata$samesex_weight_d)))]))
    w.y <- edata$samesex_weight_d[!is.na(edata$samesex_weight_d)]
    df.final$emd_lessaffluent_loknow_samesex_d[which(df.final$country ==
                                                       unique(mdata$country) &
                                                       df.final$year ==
                                                       unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_loknow_samesex_d[which(df.final$country ==
                                                      unique(mdata$country) &
                                                      df.final$year ==
                                                      unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_loknow_samesex_d[which(df.final$country ==
                                                       unique(mdata$country) &
                                                       df.final$year ==
                                                       unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    
    ### Econ
    y <- as.matrix(na.omit(edata$econ_scaled[which(!(is.na(edata$econ_weight)))]))
    w.y <- edata$econ_weight[!is.na(edata$econ_weight)]
    x.lessaffluent <- as.matrix(na.omit(
      mdata$econ_scaled[which(mdata$class == "0" & mdata$know == "0")]))
    w.x.lessaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$econ_scaled[which(mdata$class == "0"& mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$econ_scaled[which(mdata$class == "0"& mdata$know == "0")])))))
    x.midaffluent <- as.matrix(na.omit(
      mdata$econ_scaled[which(mdata$class == "2" & mdata$know == "0")]))
    w.x.midaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$econ_scaled[which(mdata$class == "2"& mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$econ_scaled[which(mdata$class == "2"& mdata$know == "0")])))))
    x.moreaffluent <- as.matrix(na.omit(
      mdata$econ_scaled[which(mdata$class == "4"& mdata$know == "0")]))
    w.x.moreaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$econ_scaled[which(mdata$class == "4"& mdata$know == "0")]))),
      nrow(data.frame(na.omit(
        mdata$econ_scaled[which(mdata$class == "4"& mdata$know == "0")])))))
    df.final$emd_lessaffluent_loknow_econ[which(df.final$country ==
                                                  unique(mdata$country) &
                                                  df.final$year ==
                                                  unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_loknow_econ[which(df.final$country ==
                                                 unique(mdata$country) &
                                                 df.final$year ==
                                                 unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_loknow_econ[which(df.final$country ==
                                                  unique(mdata$country) &
                                                  df.final$year ==
                                                  unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    # dropped weights where no poststratification
    y <- as.matrix(na.omit(edata$econ_scaled[which(!(is.na(edata$econ_weight_d)))]))
    w.y <- edata$econ_weight_d[!is.na(edata$econ_weight_d)]
    df.final$emd_lessaffluent_loknow_econ_d[which(df.final$country ==
                                                    unique(mdata$country) &
                                                    df.final$year ==
                                                    unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_loknow_econ_d[which(df.final$country ==
                                                   unique(mdata$country) &
                                                   df.final$year ==
                                                   unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_loknow_econ_d[which(df.final$country ==
                                                    unique(mdata$country) &
                                                    df.final$year ==
                                                    unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    
    
    #### Congruence by class, hi information
    ### Samesex marriage
    y <- as.matrix(na.omit(edata$samesex_scaled[which(!(is.na(edata$samesex_weight)))]))
    w.y <- edata$samesex_weight[!is.na(edata$samesex_weight)]
    x.lessaffluent <- as.matrix(na.omit(
      mdata$samesex_scaled[which(mdata$class == "0" & mdata$know == "4")]))
    w.x.lessaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$samesex_scaled[which(mdata$class == "0"& mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$samesex_scaled[which(mdata$class == "0"& mdata$know == "4")])))))
    x.midaffluent <- as.matrix(na.omit(
      mdata$samesex_scaled[which(mdata$class == "2" & mdata$know == "4")]))
    w.x.midaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$samesex_scaled[which(mdata$class == "2"& mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$samesex_scaled[which(mdata$class == "2"& mdata$know == "4")])))))
    x.moreaffluent <- as.matrix(na.omit(
      mdata$samesex_scaled[which(mdata$class == "4"& mdata$know == "4")]))
    w.x.moreaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$samesex_scaled[which(mdata$class == "4"& mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$samesex_scaled[which(mdata$class == "4"& mdata$know == "4")])))))
    df.final$emd_lessaffluent_hiknow_samesex[which(df.final$country ==
                                                     unique(mdata$country) &
                                                     df.final$year ==
                                                     unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_hiknow_samesex[which(df.final$country ==
                                                    unique(mdata$country) &
                                                    df.final$year ==
                                                    unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_hiknow_samesex[which(df.final$country ==
                                                     unique(mdata$country) &
                                                     df.final$year ==
                                                     unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    # dropped weights where no poststratification
    y <- as.matrix(na.omit(edata$samesex_scaled[which(!(is.na(edata$samesex_weight_d)))]))
    w.y <- edata$samesex_weight_d[!is.na(edata$samesex_weight_d)]
    df.final$emd_lessaffluent_hiknow_samesex_d[which(df.final$country ==
                                                       unique(mdata$country) &
                                                       df.final$year ==
                                                       unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_hiknow_samesex_d[which(df.final$country ==
                                                      unique(mdata$country) &
                                                      df.final$year ==
                                                      unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_hiknow_samesex_d[which(df.final$country ==
                                                       unique(mdata$country) &
                                                       df.final$year ==
                                                       unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    
    ### Econ
    y <- as.matrix(na.omit(edata$econ_scaled[which(!(is.na(edata$econ_weight)))]))
    w.y <- edata$econ_weight[!is.na(edata$econ_weight)]
    x.lessaffluent <- as.matrix(na.omit(
      mdata$econ_scaled[which(mdata$class == "0" & mdata$know == "4")]))
    w.x.lessaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$econ_scaled[which(mdata$class == "0"& mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$econ_scaled[which(mdata$class == "0"& mdata$know == "4")])))))
    x.midaffluent <- as.matrix(na.omit(
      mdata$econ_scaled[which(mdata$class == "2" & mdata$know == "4")]))
    w.x.midaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$econ_scaled[which(mdata$class == "2"& mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$econ_scaled[which(mdata$class == "2"& mdata$know == "4")])))))
    x.moreaffluent <- as.matrix(na.omit(
      mdata$econ_scaled[which(mdata$class == "4"& mdata$know == "4")]))
    w.x.moreaffluent <- as.matrix(rep(1/nrow(data.frame(na.omit(
      mdata$econ_scaled[which(mdata$class == "4"& mdata$know == "4")]))),
      nrow(data.frame(na.omit(
        mdata$econ_scaled[which(mdata$class == "4"& mdata$know == "4")])))))
    df.final$emd_lessaffluent_hiknow_econ[which(df.final$country ==
                                                  unique(mdata$country) &
                                                  df.final$year ==
                                                  unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_hiknow_econ[which(df.final$country ==
                                                 unique(mdata$country) &
                                                 df.final$year ==
                                                 unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_hiknow_econ[which(df.final$country ==
                                                  unique(mdata$country) &
                                                  df.final$year ==
                                                  unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
    # dropped weights where no poststratification
    y <- as.matrix(na.omit(edata$econ_scaled[which(!(is.na(edata$econ_weight_d)))]))
    w.y <- edata$econ_weight_d[!is.na(edata$econ_weight_d)]
    df.final$emd_lessaffluent_hiknow_econ_d[which(df.final$country ==
                                                    unique(mdata$country) &
                                                    df.final$year ==
                                                    unique(mdata$year))] <-
      emdw(x.lessaffluent,w.x.lessaffluent,y,w.y,max.iter = 100000)
    df.final$emd_midaffluent_hiknow_econ_d[which(df.final$country ==
                                                   unique(mdata$country) &
                                                   df.final$year ==
                                                   unique(mdata$year))] <-
      emdw(x.midaffluent,w.x.midaffluent,y,w.y,max.iter = 100000)
    df.final$emd_moreaffluent_hiknow_econ_d[which(df.final$country ==
                                                    unique(mdata$country) &
                                                    df.final$year ==
                                                    unique(mdata$year))] <-
      emdw(x.moreaffluent,w.x.moreaffluent,y,w.y,max.iter = 100000)
  }
  setTxtProgressBar(pb, i)
}
# Overwrite every cell that is.na() flags (this includes NaN) with a plain NA
# so that only one kind of missing value is written to disk.
missing_cells <- is.na(df.final)
df.final[missing_cells] <- NA
# and finally fix just a few names: map the names currently stored in
# df.final$country (left) to the shorter names used from here on (right)
country_fixes <- c("Slovak Republic" = "Slovakia",
                   "Venezuela, RB" = "Venezuela")
for (old_name in names(country_fixes)) {
  df.final$country[which(df.final$country == old_name)] <-
    country_fixes[[old_name]]
}
# store all of the data
write.csv(pela, "./final/pela-full.csv", row.names = FALSE)
write.csv(lapop, "./final/lapop-full.csv", row.names = FALSE)
write.csv(df.final, "./final/final-many-to-many.csv", row.names = FALSE)
# clean up: keep only the utilities and the three datasets still needed below
# (the temporaries created above are removed by this call as well)
keep_objects <- c("completeFun", "expand.grid.df",
                  "qcut", "rescalr", "simpleCap", "df.final",
                  "pela", "lapop")
rm(list = setdiff(ls(), keep_objects))

#### finally, let's output dyads
# Build every mass-respondent x elite-respondent pair within each country-year
# listed in df.final, derive distance / direction / weight variables for each
# pair, and export the result to ./final/latam-dyads.csv.
lapop$mass_id <- seq_len(nrow(lapop))
pela$elite_id <- seq_len(nrow(pela))
# zero-row template carrying the prefixed column names; it is what gets
# exported if no country-year has both mass and elite respondents
dyads <- data.frame(matrix(ncol = (ncol(lapop) + ncol(pela)), nrow = 0))
colnames(dyads) <- c(paste("m", colnames(lapop), sep = "_"),
                     paste("e", colnames(pela), sep = "_"))
# prefix the source columns so mass ("m_") and elite ("e_") variables stay
# distinguishable after the cross join
colnames(lapop) <- c(paste("m", colnames(lapop), sep = "_"))
colnames(pela) <- c(paste("e", colnames(pela), sep = "_"))
# create all dyads within matching country-years; the per-country-year pieces
# are collected in a preallocated list and bound once at the end, instead of
# growing `dyads` with rbind() on every iteration
n_country_years <- nrow(df.final)
dyad_pieces <- vector("list", n_country_years)
for (i in seq_len(n_country_years)) {
  print(i)
  mdata <- lapop[which(lapop$m_country == df.final$country[i] &
                         lapop$m_year == df.final$year[i]), ]
  edata <- pela[which(pela$e_country == df.final$country[i] &
                        pela$e_year == df.final$year[i]), ]
  # only country-years covered by both surveys contribute dyads
  if (nrow(edata) > 0 && nrow(mdata) > 0) {
    dyad_pieces[[i]] <- expand.grid.df(mdata, edata)
  }
}
dyad_pieces <- Filter(Negate(is.null), dyad_pieces)
# rbind() drops zero-row arguments, so the template only survives when there
# are no pieces at all -- the same outcome as the incremental version
dyads <- do.call(rbind, c(list(dyads), dyad_pieces))
# create some DVs we care about
dyads$emd_ideology <- abs(dyads$m_ideology_scaled - dyads$e_ideology_scaled)
dyads$emd_samesex <- abs(dyads$m_samesex_scaled - dyads$e_samesex_scaled)
dyads$emd_econ <- abs(dyads$m_econ_scaled - dyads$e_econ_scaled)
# direction
dyads$diff_ideology <- dyads$m_ideology_scaled - dyads$e_ideology_scaled
dyads$diff_samesex <- dyads$m_samesex_scaled - dyads$e_samesex_scaled
dyads$diff_econ <- dyads$m_econ_scaled - dyads$e_econ_scaled
# combined weights - x100 otherwise they show up as NA
dyads$comb_weight_econ <- dyads$m_w * dyads$e_econ_weight * 100
dyads$comb_weight_ideology <- dyads$m_w * dyads$e_ideology_weight * 100
dyads$comb_weight_samesex <- dyads$m_w * dyads$e_samesex_weight * 100
# export
write.csv(dyads, "./final/latam-dyads.csv", row.names = FALSE)
# clean up: from here on only the utilities and df.final are needed
rm(list = ls()[!(ls() %in% c("completeFun", "expand.grid.df",
                             "qcut", "rescalr", "simpleCap", "df.final"))])

##### GET THE AFRICA ISSUES DATA #####
# Load the mass survey (SPSS) and the elite survey (CSV); the elite file is
# cleaned in this block, the mass file further below.
afr <- read.spss("./Afrobarometer/merged_r4_data.sav",
                 use.value.labels = TRUE, use.missings = TRUE,
                 to.data.frame = TRUE)
afrel <- read.csv("./Clayton-et-al/ALPdata.csv", stringsAsFactors = FALSE)
# clean the elite data first since it's easier
# source column -> analysis name for the eleven issue items
elite_issue_map <- c(PovertyAgg = "poverty",
                     AgricultureAgg = "agriculture",
                     EconomyAgg = "economy",
                     EducationAgg = "education",
                     HealthAgg = "health",
                     InfrastructureAgg = "infrastructure",
                     SocialRightsAgg = "social_rights",
                     PoliticalRightsAgg = "political_rights",
                     ViolenceAgg = "violence",
                     WaterAgg = "water",
                     WomenRightsAgg = "womens_rights")
afrel$country <- afrel$Country
afrel$year <- rep("2008", nrow(afrel))
afrel$survey <- rep("ALP", nrow(afrel))
# keep the three identifiers plus the issue items, then rename the items
afrel <- afrel[, c("country", "year", "survey", names(elite_issue_map))]
names(afrel) <- c("country", "year", "survey", unname(elite_issue_map))
for (issue in unname(elite_issue_map)) {
  issue_vals <- afrel[, issue]
  ### make the elite options binary: any value above 1 is capped at 1
  issue_vals[which(issue_vals > 1)] <- 1
  # rescale to -1, 1 using the observed minimum and maximum of the item
  afrel[, issue] <- rescalr(issue_vals,
                            min(issue_vals, na.rm = TRUE),
                            max(issue_vals, na.rm = TRUE),
                            -1, 1)
}
### save the cleaned version
write.csv(afrel, "./final/final-africa-elite.csv", row.names = FALSE)

### trim mass data and make numeric
# country, own radio, own TV, own car/motorcycle,m_imp1,m_imp2,m_imp3
mass_vars <- c(country = "COUNTRY",
               mat1 = "Q92A", mat2 = "Q92B", mat3 = "Q92C",
               prob1 = "Q56PT1", prob2 = "Q56PT2", prob3 = "Q56PT3")
afr <- afr[, unname(mass_vars)]
names(afr) <- names(mass_vars)
afr$year <- rep("2008", nrow(afr))
# store every column as character so the value labels can be recoded as text
afr[] <- lapply(afr, as.character)
# clean the material wealth variables: non-answers become NA, ownership
# becomes "1" and non-ownership "0" (still character at this point)
for (asset in c("mat1", "mat2", "mat3")) {
  owned <- afr[[asset]]
  owned[which(owned %in% c("Don't know", "Missing"))] <- NA
  owned[which(owned == "Yes, do own")] <- "1"
  owned[which(owned == "No, don't own")] <- "0"
  afr[[asset]] <- owned
}
# factor material wealth
# For each country-year group: run a multiple correspondence analysis (MCA) on
# the asset-ownership items (columns whose names contain "mat"), keep the first
# dimension as a wealth score, orient it so that owning more goods means a
# higher score, and finally replace the score by its quintile label
# ("0" = lowest ... "4" = highest, the labels produced by qcut()).
templist <- split(afr, f = list(afr$country, afr$year))
# drop country-year combinations that split() created but that hold no rows
templist <- templist[lapply(templist,nrow)>0]
# positions of the ownership items, looked up in the first group
numerics <- which(grepl("mat", colnames(templist[[1]])))
for(i in 1:length(templist)){
  print(i)
  # get the temporary dataframe; row names serve as the respondent key for
  # the merges below
  templist[[i]]$id <- rownames(templist[[i]])
  matgoods <- cbind(templist[[i]]$id,templist[[i]][,numerics])
  colnames(matgoods)[1] <- "id"
  # remove all NA columns (walking backwards so that dropping a column does
  # not shift the positions still to be visited)
  for(j in ncol(matgoods):2){                          # 2 since column 1 is IDs
    if(all(is.na(unique(matgoods[,j]))) == T){
      matgoods <- matgoods[,-j]
    }
  }
  # take only complete cases and factor
  matgoods <- matgoods[complete.cases(matgoods),]
  facs <- MCA(matgoods[,-1], ncp = 5, graph = F)
  # save first factor score
  matgoods$wealth <- facs$ind$coord[,1]
  # need to make sure scales are in the same direction -- make numeric,
  # sort by sum of 1's, then flip axes as necessary
  for(j in 2:(ncol(matgoods)-1)){
    matgoods[,j] <- as.numeric(as.character(matgoods[,j]))
  }
  matgoods <- matgoods[order(rowSums(matgoods[,2:(ncol(matgoods)-1)]),
                             decreasing=T),]
  # after sorting, the first row owns the most goods and the last row the
  # fewest; if the former scores lower, the axis is inverted and gets flipped
  if(as.logical(head(matgoods$wealth, 1) < tail(matgoods$wealth, 1)) == T){
    matgoods$wealth <- matgoods$wealth*(-1)
  }
  # re-merge and store metadata, matching on ID; respondents dropped as
  # incomplete cases above come back with wealth = NA
  templist[[i]] <- merge(templist[[i]],matgoods, by = "id", all.x = T,
                         all.y = T)
  templist[[i]] <- templist[[i]][,c("country", "year", "prob1",
                                    "prob2", "prob3","wealth","id")]
  # now we need to quantize
  tmpdf <- templist[[i]]
  # NOTE: jitter() draws random numbers, so the quintiles depend on the RNG
  # state at this point in the script
  tmpdf$wealth <- jitter(as.numeric(tmpdf$wealth), factor=10e-8)
  # jitter to allow quantization (presumably to break ties so that the
  # quantile break points passed to cut() are unique -- confirm)
  matqnt <- as.character(qcut(tmpdf$wealth,5)) # just to get labels
  # sanity check: the top quintile must have a higher mean score than the
  # bottom quintile
  stopifnot(mean(tmpdf$wealth[which(matqnt == max(as.numeric(matqnt),
                                                  na.rm=T))]) >
              mean(tmpdf$wealth[which(matqnt == min(as.numeric(matqnt),
                                                    na.rm=T))]))
  # make sure factor levels align
  tmpdf$wealth <- matqnt
  tmpdf <- tmpdf[,c("id","wealth")]
  # swap the continuous score (wealth.x) for the quintile label (wealth.y)
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id", all.x = T, all.y = T)
  templist[[i]]$wealth.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "wealth.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "wealth"
  templist[[i]] <- templist[[i]][,c("country", "year", "prob1",
                                    "prob2", "prob3","wealth")]
}
# stack the per-group results back into a single data frame
afr <- do.call(rbind, templist)
row.names(afr) <- NULL
### clean the issues
# positions of the three "most important problem" answers (prob1-prob3)
numerics <- grep("prob", colnames(afr))
# answers that do not name a substantive problem are treated as missing
non_answers <- c("Don't know", "Missing", "No further reply",
                 "Nothing/No problems", "Other")
for (i in numerics) {
  afr[which(afr[, i] %in% non_answers), i] <- NA
}

### make the general issue codings, following Clayton et al and Gottlieb et al in turn
# Each issue indicator starts at 0 and is set to 1 when any of the respondent's
# problem answers (the columns indexed by `numerics`) falls in that issue's
# list of answer labels. The chained assignment is kept as-is because it
# fixes the order in which the columns are appended to `afr`.
afr$poverty <- afr$agriculture <- afr$economy <- afr$education <- afr$health <-
  afr$infrastructure <- afr$social_rights <- afr$political_rights <- afr$violence <-
  afr$water <- afr$womens_rights <- rep(0,nrow(afr))
# issue -> answer labels that count towards it
issue_labels <- list(
  poverty = c("Poverty/ destitution",
              "Food shortage/ famine",
              "Orphans/ street children/ homeless"),
  agriculture = c("Agricultural Marketing",
                  "Farm Inputs",
                  "Drought",
                  "Barrage",
                  "Farming/ agriculture", "Land"),
  economy = c("Rates and taxes", "Unemployment",
              "Wages, incomes and salaries",
              "Loans/ credit",
              "Management of economy"),
  education = c("Education"),
  health = c("AIDS", "Health",
             "Sickness/ disease"),
  infrastructure = c("Communications",
                     "Electricity",
                     "Housing",
                     "Transportation",
                     "Infrastructure/ roads"),
  social_rights = c("Discrimination/ inequality"),
  political_rights = c("Corruption",
                       "Democracy/ political rights"),
  violence = c("Civil war", "Crime and security",
               "Political instability/ ethnic tensions",
               "Political violence",
               "Emigration",
               "War (international)"),
  water = c("Water supply"),
  womens_rights = c("Gender issues/ women's rights")
)
# Vectorised over respondents: one %in% mask per (answer column, issue) pair
# replaces the row-by-row ifelse() calls. %in% never returns NA, so missing
# answers simply leave the indicator untouched.
for (i in numerics) {
  answers <- afr[, i]
  for (issue in names(issue_labels)) {
    afr[[issue]][answers %in% issue_labels[[issue]]] <- 1
  }
}
# adjust for NAs
# Respondents with no usable answer to any of the three problem questions get
# NA (rather than 0) on every issue indicator. Computed as one logical mask so
# that it also behaves when `afr` has no rows.
no_answer <- is.na(afr$prob1) & is.na(afr$prob2) & is.na(afr$prob3)
for (issue in c("poverty", "agriculture", "economy", "education", "health",
                "infrastructure", "social_rights", "political_rights",
                "violence", "water", "womens_rights")) {
  afr[[issue]][no_answer] <- NA
}
# remove the problem questions
afr$prob1 <- afr$prob2 <- afr$prob3 <- NULL
# now rescale -- could've done it above but will do it here just for clarity
# every column except the identifiers and the wealth quintile is mapped onto
# [-1, 1] using its own observed minimum and maximum
issue_vars <- setdiff(names(afr), c("country", "year", "wealth"))
afr[issue_vars] <- lapply(afr[issue_vars], function(v) {
  rescalr(v, min(v, na.rm = TRUE), max(v, na.rm = TRUE), -1, 1)
})
### since it's the same set of citizens who are NA on all issues, remove them
afr <- afr[!is.na(afr$poverty), ]

### save the cleaned version
write.csv(afr, "./final/final-africa-mass.csv", row.names = FALSE)

#### finally, let's output dyads
# Build every mass-respondent x elite-respondent pair within each country-year
# present in the elite data, compute per-issue absolute and signed differences,
# and export the result to ./final/africa-dyads.csv.
afr$mass_id <- seq_len(nrow(afr))
afrel$elite_id <- seq_len(nrow(afrel))
# zero-row template carrying the prefixed column names; it is what gets
# exported if no country-year has both mass and elite respondents
dyads <- data.frame(matrix(ncol = (ncol(afr) + ncol(afrel)), nrow = 0))
colnames(dyads) <- c(paste("m", colnames(afr), sep = "_"),
                     paste("e", colnames(afrel), sep = "_"))
# prefix the source columns so mass ("m_") and elite ("e_") variables stay
# distinguishable after the cross join
colnames(afr) <- c(paste("m", colnames(afr), sep = "_"))
colnames(afrel) <- c(paste("e", colnames(afrel), sep = "_"))
# create all dyads within matching country-years; pieces are collected in a
# preallocated list and bound once, instead of growing `dyads` with rbind()
# on every iteration
dd <- unique(afrel[c("e_country", "e_year")])
dyad_pieces <- vector("list", nrow(dd))
for (i in seq_len(nrow(dd))) {
  print(i)
  mdata <- afr[which(afr$m_country == dd$e_country[i] &
                       afr$m_year == dd$e_year[i]), ]
  edata <- afrel[which(afrel$e_country == dd$e_country[i] &
                         afrel$e_year == dd$e_year[i]), ]
  # only country-years covered by both surveys contribute dyads
  if (nrow(edata) > 0 && nrow(mdata) > 0) {
    dyad_pieces[[i]] <- expand.grid.df(mdata, edata)
  }
}
dyad_pieces <- Filter(Negate(is.null), dyad_pieces)
# rbind() drops zero-row arguments, so the template only survives when there
# are no pieces at all -- the same outcome as the incremental version
dyads <- do.call(rbind, c(list(dyads), dyad_pieces))
# create some DVs we care about: absolute mass-elite distance per issue
issues <- c("poverty", "agriculture", "economy", "education", "health",
            "infrastructure", "social_rights", "political_rights",
            "violence", "water", "womens_rights")
for (issue in issues) {
  dyads[[paste0("emd_", issue)]] <-
    abs(dyads[[paste0("m_", issue)]] - dyads[[paste0("e_", issue)]])
}
# average distance across all issues; the emd_ columns are selected by name
# rather than by position so the result does not depend on column counts
dyads$emd_all <- rowMeans(dyads[, paste0("emd_", issues)])
# direction: signed difference (mass minus elite) per issue
for (issue in issues) {
  dyads[[paste0("diff_", issue)]] <-
    dyads[[paste0("m_", issue)]] - dyads[[paste0("e_", issue)]]
}
# export
write.csv(dyads, "./final/africa-dyads.csv", row.names = FALSE)
# clean up: from here on only the utilities and df.final are needed
rm(list = ls()[!(ls() %in% c("completeFun", "expand.grid.df",
                             "qcut", "rescalr", "simpleCap", "df.final"))])

##### GET THE SWEDEN ISSUES DATA #####
### read in elite data and clean
swel <- read.spss("./Sweden RDU surveys/Super data set for Noam Lupu 2018.sav", 
                  use.value.labels = TRUE, use.missings = TRUE, 
                  to.data.frame = TRUE)
# variable names are assigned by position
names(swel) <- c("public_sector", "defense_spending", "health_private",
                 "pornography", "working_day", "reduce_inequality", 
                 "ban_driving", "fewer_refugees", "nato", "ideology",
                 "occupation", "sex", "party", "year")
swel$occupation <- NULL
# work with plain character columns throughout the cleaning
swel[] <- lapply(swel, as.character)
# recode policy questions (the first nine columns) from the Swedish response
# labels to "0"-"4", then make them numeric; any other text becomes NA
response_labels <- c("Mycket bra förslag", "Ganska bra förslag",
                     "Varken bra eller dåligt förslag",
                     "Ganska dåligt förslag", "Mycket dåligt förslag")
response_codes <- c("4", "3", "2", "1", "0")
for (i in 1:9) {
  hit <- match(swel[[i]], response_labels)
  swel[[i]][!is.na(hit)] <- response_codes[hit[!is.na(hit)]]
  swel[[i]] <- as.numeric(swel[[i]])
}
# make "0" the "left" position
for (v in c("public_sector", "defense_spending", "working_day",
            "reduce_inequality", "ban_driving")) {
  swel[[v]] <- -1 * (swel[[v]] - 4)
}
# clean ideology: keep digits only
swel$ideology <- gsub("[^\\d]+", "", swel$ideology, perl=TRUE)
# clean gender
swel$sex[which(swel$sex == "Man")] <- "Male"
swel$sex[which(swel$sex == "Woman")] <- "Female"
# drop 1991 since no data. A logical filter is used because
# swel[-which(cond), ] returns zero rows when nothing matches; rows with a
# missing year are kept, as before.
swel <- swel[!(swel$year %in% "1991"), ]
# create panel
# Each survey wave is copied to every year from the survey year (leg.begin)
# through the mapped end year (leg.end), inclusive.
swel$leg.begin <- swel$year
leg_end_by_begin <- c("1985" = "1987",
                      "1988" = "1991", #since 1991 data missing
                      "1994" = "1995",
                      "1996" = "1997",
                      "1998" = "2001",
                      "2002" = "2005",
                      "2006" = "2009",
                      "2010" = "2013",
                      "2014" = "2017")
swel$leg.end <- unname(leg_end_by_begin[swel$leg.begin])
if (anyNA(swel$leg.end)) {
  stop("no legislature end year defined for survey year(s): ",
       paste(unique(swel$leg.begin[is.na(swel$leg.end)]), collapse = ", "),
       call. = FALSE)
}
# expand all rows in one step instead of growing a data frame with rbind()
# row by row: repeat each row once per year of its span, then fill in the
# running year
first_year <- as.integer(swel$leg.begin)
last_year <- as.integer(swel$leg.end)
keep_cols <- setdiff(names(swel), c("leg.begin", "leg.end"))
swel.full <- swel[rep(seq_len(nrow(swel)), times = last_year - first_year + 1L),
                  keep_cols]
swel.full$year <- as.integer(unlist(Map(seq.int, first_year, last_year),
                                    use.names = FALSE))
swel <- swel.full; rm(swel.full)
# clean up
swel$country <- rep("Sweden", nrow(swel))
swel$ideology <- as.numeric(swel$ideology); swel$year <- as.character(swel$year)
swel <- swel[order(swel$country,swel$year),
             c("country","year","party","sex","ideology","public_sector",
               "defense_spending","health_private","pornography","working_day",
               "reduce_inequality","ban_driving","fewer_refugees","nato")]
# checkpoint
write.csv(swel, "./final/final-sweden-elite.csv", row.names = F)

### create weights
# first gender. we have the population in most cases so just take the empirical proportion
# (Male/Female are coded 1/2, averaged by year, then shifted down by one so the
# value is the share of women among rows with a recorded sex)
w <- aggregate(as.numeric(factor(sex, labels = c("0","1"), 
                                 levels = c("Male","Female"))) ~ year, 
               data = swel, FUN = mean)
names(w)[2] <- "sex"
w$sex <- (w$sex - 1)
# get the real proportion for years we don't have the population
ww <- WDI(country = "SE", indicator = "SG.GEN.PARL.ZS", start = 2002, end = 2009)
# align the WDI values to `w` by year rather than by row position, so the
# result does not depend on the order in which WDI() returns its rows or on
# the 2002-2009 rows of `w` being contiguous
pos <- match(as.character(ww$year), as.character(w$year))
w$sex[pos[!is.na(pos)]] <- ww$SG.GEN.PARL.ZS[!is.na(pos)] / 100
rm(ww, pos)
# second party. import
p <- read.csv("./partyweights/partyweights.csv", stringsAsFactors = F)

### get set up to loop over and create weights
# unique row key ("e1", "e2", ...) used to merge computed weights back in
swel$id <- paste0("e", as.character(seq(1, nrow(swel), 1)))
# one data frame per year, in ascending year order
survey_years <- sort(unique(swel$year))
elist <- lapply(survey_years, function(yr) swel[which(swel$year == yr), ])
# column positions (in `swel`) of the variables that receive weights
weight_vars <- c("ideology", "public_sector", "defense_spending",
                 "health_private", "pornography", "working_day",
                 "reduce_inequality", "ban_driving", "fewer_refugees",
                 "nato")
kk <- which(names(swel) %in% weight_vars)

### calculate weights
# For every year (elements of `elist`) and every weighted variable (`kk`), add
# a column w_<variable> holding each respondent's weight. Weights are built
# within sex-by-party cells so that each cell's total equals
# (party weight from `p`) x (gender share from `w`).
for (i in seq_along(elist)) {
  for (k in kk) {
    ### work in a temporary environment
    tmp <- elist[[i]]
    issue <- names(swel)[k]
    wname <- paste0("w_", issue)
    yr <- unique(elist[[i]]$year)

    ### if no issue k, skip it
    nn <- sum(!is.na(tmp[[issue]]))
    if (nn == 0) {
      tmp[[wname]] <- rep(NA, nrow(tmp))
      tmp <- tmp[, c("id", wname)]
      elist[[i]] <- merge(elist[[i]], tmp, by = "id", all.x = TRUE, all.y = TRUE)
      rm(tmp)
      next
    }

    ### if question was asked, weight by both sex and party
    tmp <- tmp[which(!is.na(tmp[[issue]]) & !is.na(tmp$sex) &
                       !is.na(tmp$party)), ]
    tmp$w <- rep(1 / nrow(tmp), nrow(tmp))

    # get the gender weights, looked up by year rather than by list position so
    # a year missing from `w` cannot shift every later year's weight
    ww <- w$sex[match(yr, as.character(w$year))]
    if (length(ww) != 1 || is.na(ww)) {
      stop("no gender weight available for year ", paste(yr, collapse = ", "),
           call. = FALSE)
    }
    mw <- 1 - ww
    # break it into party/sex binaries, then weight and recombine
    obslist <- split(tmp, f = list(tmp$sex, tmp$party))
    for (j in seq_along(obslist)) {
      pw <- p$weight[which(p$country == "Sweden" &
                             p$year == yr &
                             p$party == unique(obslist[[j]]$party))]
      # ifelse() (not if/else) because empty sex-by-party cells yield a
      # zero-length condition here
      sw <- ifelse(unique(obslist[[j]]$sex) == "Male", mw, ww)
      W <- pw * sw
      obslist[[j]]$w <- (W / sum(obslist[[j]]$w)) * obslist[[j]]$w
      rm(pw, sw, W) # so no accidental duplicates
    }
    tmp <- do.call(rbind, obslist)
    stopifnot(abs(1 - sum(tmp$w)) < .1)
    # sometimes parties only have 1 gender, in which case this is slightly off
    # not much we can do, but the size of these errors are small.
    # reweight in such cases -- always women need to be upweighted
    # (all.equal() returns TRUE or a character description, so it has to be
    # wrapped in isTRUE() to be used as a condition)
    if (!isTRUE(all.equal(sum(tmp$w), 1))) {
      shortfall <- 1 - sum(tmp$w)
      women <- which(tmp$sex == "Female")
      tmp$w[women] <- tmp$w[women] + shortfall / length(women)
      tmp$w <- tmp$w * (1 / sum(tmp$w))
      stopifnot(abs(ww - sum(tmp$w[women])) < .05)
      rm(shortfall, women)
    }
    stopifnot(isTRUE(all.equal(sum(tmp$w), 1)))
    tmp[[wname]] <- tmp$w
    # merge
    tmp <- tmp[, c("id", wname)]
    elist[[i]] <- merge(elist[[i]], tmp, by = "id", all.x = TRUE, all.y = TRUE)
    rm(mw, ww, tmp, obslist)
  }
}
swel <- do.call(rbind, elist)
swel$id <- NULL
swel <- swel[order(swel$country, swel$year),]
# checkpoint elites again
write.csv(swel, "./final/final-sweden-elite.csv", row.names = F)
rm(p,elist,w,i,j,k,kk,nn,issue,wname,yr)

### read in mass data and clean
# every matching workbook under ./SNES (searched recursively), one data frame
# per file, in the order list.files() returns them
files <- list.files(path = "./SNES", pattern = "\\.xls$|\\.XLSX$", recursive = T)
files <- paste0("./SNES/", files)
data.list <- lapply(files, function(path) as.data.frame(read_excel(path)))
# Harmonize one survey wave: keep the raw variables in `vars`, rename them to
# the common names in `labels` (same order), stamp the survey year, and add an
# all-NA column for every name in `absent`.
prep_wave <- function(raw, vars, labels, year, absent = character(0)) {
  out <- raw[, vars]
  names(out) <- labels
  out$year <- rep(year, nrow(out))
  for (item in absent) {
    out[[item]] <- rep(NA, nrow(out))
  }
  out
}
# 1985
data.list[[1]] <- prep_wave(
  data.list[[1]],
  vars = c("V201","V256","V257","V258","V259","V260",
           "V316","V319","V99","V102","V106","V107",
           "V111"),
  labels = c("ideology", paste0("info", 1:5), "occupation", "education",
             "public_sector", "defense_spending", "health_private",
             "pornography", "working_day"),
  year = "1985",
  absent = c("nato", "fewer_refugees", "ban_driving", "reduce_inequality")
)
# 1988
data.list[[2]] <- prep_wave(
  data.list[[2]],
  vars = c("V178","V229","V230","V231","V232","V233",
           "V289","V299","V112","V114","V123","V126",
           "V121","V119"),
  labels = c("ideology", paste0("info", 1:5), "occupation", "education",
             "public_sector", "defense_spending", "health_private",
             "working_day", "reduce_inequality", "ban_driving"),
  year = "1988",
  absent = c("nato", "fewer_refugees", "pornography")
)
# 1991
data.list[[3]] <- prep_wave(
  data.list[[3]],
  vars = c("v217","v283","v284","v285","v286","v287",
           "v288","v289","v356","v359","v125","v127",
           "v143","v137","v134","v130"),
  labels = c("ideology", paste0("info", 1:7), "occupation", "education",
             "public_sector", "defense_spending", "health_private",
             "pornography", "working_day", "reduce_inequality"),
  year = "1991",
  absent = c("nato", "fewer_refugees", "ban_driving")
)
# 1994
data.list[[4]] <- prep_wave(
  data.list[[4]],
  vars = c("v212","v295","v296","v297","v298","v299",
           "v300","v361","v368","v126","v127","v134",
           "v137","v135","v131","v142","v140"),
  labels = c("ideology", paste0("info", 1:6), "occupation", "education",
             "public_sector", "defense_spending", "health_private",
             "pornography", "working_day", "reduce_inequality",
             "ban_driving", "fewer_refugees"),
  year = "1994",
  absent = "nato"
)
# 1998
data.list[[5]] <- prep_wave(
  data.list[[5]],
  vars = c("v366","v219","v220","v221","v222","v223",
           "v224","v225","v380","v384","v137","v138",
           "v145","v148","v147","v142","v154","v151",
           "v157"),
  labels = c("ideology", paste0("info", 1:7), "occupation", "education",
             "public_sector", "defense_spending", "health_private",
             "pornography", "working_day", "reduce_inequality",
             "ban_driving", "fewer_refugees", "nato"),
  year = "1998"
)
# 2002
data.list[[6]] <- prep_wave(
  data.list[[6]],
  vars = c("V247","V291","V292","V293","V294","V295",
           "V296","V297","V298","V490","V500","V147",
           "V148","V154","V161","V160","V153","V167"),
  labels = c("ideology", paste0("info", 1:8), "occupation", "education",
             "public_sector", "defense_spending", "health_private",
             "pornography", "working_day", "reduce_inequality",
             "ban_driving"),
  year = "2002",
  absent = c("nato", "fewer_refugees")
)
# 2006
data.list[[7]] <- prep_wave(
  data.list[[7]],
  vars = c("V596","V611","V612","V613","V614","V615",
           "V616","V617","V618","V765","V772","V572",
           "V571","V576"),
  labels = c("ideology", paste0("info", 1:8), "occupation", "education",
             "pornography", "working_day", "ban_driving"),
  year = "2006",
  absent = c("nato", "fewer_refugees", "reduce_inequality", "health_private",
             "defense_spending", "public_sector")
)
# 2010
data.list[[8]] <- prep_wave(
  data.list[[8]],
  vars = c("VU10_V885","VU10_V925","VU10_V926",
           "VU10_V927","VU10_V928","VU10_V929",
           "VU10_V930","VU10_V931","VU10_V932",
           "VU10_V933","VU10_V1096","VU10_V1116",
           "VU10_V866","VU10_V865"),
  labels = c("ideology", paste0("info", 1:9), "occupation", "education",
             "pornography", "working_day"),
  year = "2010",
  absent = c("nato", "fewer_refugees", "ban_driving", "reduce_inequality",
             "health_private", "defense_spending", "public_sector")
)
# merge them
# Bring every wave to the same 22 columns (same names, same order, all
# character) and stack them.
snes_cols <- c("year","ideology","info1","info2","info3",
               "info4","info5","info6","info7","info8",
               "info9","occupation","education",
               "public_sector", "defense_spending",
               "health_private", "pornography",
               "working_day", "reduce_inequality",
               "ban_driving", "fewer_refugees", "nato")
info_cols <- paste0("info", 1:9)
for (i in seq_along(data.list)) {
  # add whichever knowledge items this wave lacks as all-NA columns; the
  # missing items are identified by name instead of being inferred from the
  # number of columns
  for (nm in setdiff(info_cols, names(data.list[[i]]))) {
    data.list[[i]][[nm]] <- rep(NA, nrow(data.list[[i]]))
  }
  data.list[[i]] <- data.list[[i]][, snes_cols]
  data.list[[i]][] <- lapply(data.list[[i]], as.character)
}
sw <- do.call(rbind, data.list)
rm(data.list, files, snes_cols, info_cols)
# meta info
sw$country <- rep("Sweden", nrow(sw))
sw$income <- rep(NA, nrow(sw))
# clean ideology: these codes are set to missing
sw$ideology[sw$ideology %in% c("77", "88", "8888")] <- NA
# clean education: general codes first, then codes that are only set to
# missing in particular survey years
sw$education[sw$education %in% c("77", "88", "8888")] <- NA
sw$education[which(sw$year %in% c("1985", "1988", "1991") &
                     sw$education == "8")] <- NA
sw$education[which(sw$year == "1985" & sw$education == "9")] <- NA
# clean occupation: drop the listed codes, label "4" as worker and "2"/"3" as
# professional, and lump every remaining non-missing value into "other"
sw$occupation[sw$occupation %in% c("88", "96", "99", "8888", "9000", 
                                   "9995", "9998", "9999")] <- NA
sw$occupation[which(sw$occupation == "4")] <- "worker"
sw$occupation[sw$occupation %in% c("2", "3")] <- "professional"
sw$occupation[!(sw$occupation %in% c("worker", "professional", NA))] <- "other"
# clean information variables
infor <- grep("info", names(sw))
for (i in infor) {
  sw[[i]][sw[[i]] %in% c("6", "7", "8", "9","8888")] <- NA
}
# answer key: for each item, the survey years in which the given response
# code is marked as the correct answer
answer_key <- list(
  list("info1", c("1985","1988","1991"), "1"),
  list("info1", c("1994","1998","2002","2006","2010"), "5"),
  list("info2", c("1994","1998","2002","2006","2010"), "1"),
  list("info2", c("1985","1988","1991"), "5"),
  list("info3", c("1985","1988","1991","2002","2006"), "1"),
  list("info3", c("1994","1998","2010"), "5"),
  list("info4", c("1991","1994","1998","2006"), "1"),
  list("info4", c("1985","1988","2002","2010"), "5"),
  list("info5", c("2002","2006","2010"), "1"),
  list("info5", c("1985","1988","1991","1994","1998"), "5"),
  list("info6", c("1994","1998","2006","2010"), "1"),
  list("info6", c("1985","1988","1991","2002"), "5"),
  list("info7", c("1991","2002"), "1"),
  list("info7", c("1985","1988","1994","1998","2006","2010"), "5"),
  list("info8", c("2002","2006","2010"), "1"),
  list("info8", c("1985","1988","1991","1994","1998"), "5"),
  list("info9", c("2010"), "5")
)
for (rule in answer_key) {
  hit <- which(sw$year %in% rule[[2]] & sw[[rule[[1]]]] == rule[[3]])
  sw[[rule[[1]]]][hit] <- "correct"
}
# anything still non-missing and not marked correct is incorrect; then
# recode knowledge to 0/1 now that it's clean
for (i in infor) {
  sw[[i]][!(sw[[i]] %in% c("correct", NA))] <- "incorrect"
  sw[[i]][which(sw[[i]] == "correct")] <- "1"
  sw[[i]][which(sw[[i]] == "incorrect")] <- "0"
}
# split into each country-year
# Build one "knowledge" score per respondent from the 0/1 info items, separately
# for each year, and store it as a quintile label ("0"-"4", the labels qcut()
# assigns). Years with no usable info items get NA.
templist <- split(sw, f = sw$year)
knowledge <- which(grepl("info", colnames(sw)))
# factor information
for(i in 1:length(templist)){
  print(i)
  # get the temporary dataframe
  # (row names serve as the merge key between the item matrix and the full data)
  templist[[i]]$id <- rownames(templist[[i]])
  knowmat <- cbind(templist[[i]]$id,templist[[i]][,knowledge])
  colnames(knowmat)[1] <- "id"
  # remove all NA columns
  # (walk from the last column to the second so removals do not shift the
  # positions still to be visited)
  for(j in ncol(knowmat):2){       # 2 since column 1 is IDs
    if(all(is.na(unique(knowmat[,j]))) == T){
      knowmat <- knowmat[,-j]
    }
  }
  # if no info questions were asked, we skip the country-year
  # (when only the id column is left, knowmat[,-j] returns a plain vector, so
  # dim() is NULL)
  if(is.null(dim(knowmat)) == T){
    templist[[i]]$knowledge <- rep(NA,nrow(templist[[i]]))
    templist[[i]] <- templist[[i]][,c("country","year","ideology","occupation",
                                      "education","income","knowledge",
                                      "public_sector", "defense_spending", 
                                      "health_private", "pornography", 
                                      "working_day", "reduce_inequality", 
                                      "ban_driving", "fewer_refugees", "nato")]
    next
  }
  # take only complete cases and factor -- sometimes this eliminates all obs,
  # so drop one question in that case
  # NOTE(review): nothing below actually drops a question when kmat ends up
  # with zero rows -- confirm whether that case can occur.
  kmat <- knowmat[complete.cases(knowmat),]
  # only one info item available: use it directly as the score, relabelling a
  # correct answer ("1") as "4" and leaving "0" as is
  # NOTE(review): presumably so the values line up with the bottom/top quintile
  # labels used in the multi-item case -- confirm.
  if(dim(kmat)[2] == 2){
    colnames(knowmat)[2] <- "knowledge"
    stopifnot(nrow(templist[[i]]) == nrow(knowmat))
    knowmat$knowledge <- as.character(knowmat$knowledge)
    knowmat$knowledge[which(knowmat$knowledge == "1")] <- "4"
    templist[[i]]$knowledge <- knowmat$knowledge
    templist[[i]] <- templist[[i]][,c("country","year","ideology","occupation",
                                      "education","income","knowledge",
                                      "public_sector", "defense_spending", 
                                      "health_private", "pornography", 
                                      "working_day", "reduce_inequality", 
                                      "ban_driving", "fewer_refugees", "nato")]
    next
  }
  # MCA
  # (multiple correspondence analysis on the item columns, complete cases only)
  facs <- MCA(kmat[,-1], ncp = 5, graph = F)
  # save first factor score
  kmat$knowledge <- facs$ind$coord[,1]
  knowmat <- kmat
  # need to make sure scales are in the same direction -- make numeric,
  # sort by sum of 1's, then flip axes as necessary
  for(j in 2:(ncol(knowmat)-1)){
    knowmat[,j] <- as.numeric(as.character(knowmat[,j]))
  }
  knowmat <- knowmat[order(rowSums(knowmat[,
                                           2:(ncol(knowmat)-1)]),decreasing=T),]
  # after sorting, the first row has the most correct answers and the last row
  # the fewest; if the first row scores lower, the axis is reversed
  if(as.logical(head(knowmat$knowledge, 1) < tail(knowmat$knowledge, 1)) == T){
    knowmat$knowledge <- knowmat$knowledge*(-1)
  }
  # re-merge and store metadata, matching on ID
  # (outer merge: respondents dropped as incomplete cases keep a row with NA
  # knowledge)
  templist[[i]] <- merge(templist[[i]],knowmat, by = "id", all.x = T, all.y = T)
  templist[[i]] <- templist[[i]][,c("country","year","ideology","occupation",
                                    "education","income","knowledge",
                                    "public_sector", "defense_spending", 
                                    "health_private", "pornography", 
                                    "working_day", "reduce_inequality", 
                                    "ban_driving", "fewer_refugees", "nato", "id")]
  # now we need to quantize
  tmpdf <- templist[[i]]
  # jitter() draws random numbers, so the result depends on the RNG state at
  # this point in the script
  tmpdf$knowledge <- jitter(as.numeric(tmpdf$knowledge), factor=10e-8)
  # jitter to allow quantization
  knowqnt <- as.character(qcut(tmpdf$knowledge,5)) # just to get labels
  # sanity check: the highest-labelled bin must have a higher mean score than
  # the lowest-labelled bin
  stopifnot(mean(tmpdf$knowledge[which(knowqnt == max(as.numeric(knowqnt),
                                                      na.rm=T))]) >
              mean(tmpdf$knowledge[which(knowqnt == min(as.numeric(knowqnt),
                                                        na.rm=T))]))
  # make sure factor levels align
  tmpdf$knowledge <- knowqnt
  tmpdf <- tmpdf[,c("id","knowledge")]
  # replace the continuous score with its quintile label: merge adds
  # knowledge.x (old) and knowledge.y (new); drop the old and rename the new
  templist[[i]] <- merge(templist[[i]], tmpdf, by = "id", all.x = T, all.y = T)
  templist[[i]]$knowledge.x <- NULL
  stopifnot(colnames(templist[[i]])[ncol(templist[[i]])] == "knowledge.y")
  colnames(templist[[i]])[ncol(templist[[i]])] <- "knowledge"
  templist[[i]] <- templist[[i]][,c("country","year","ideology","occupation",
                                    "education","income","knowledge",
                                    "public_sector", "defense_spending", 
                                    "health_private", "pornography", 
                                    "working_day", "reduce_inequality", 
                                    "ban_driving", "fewer_refugees", "nato")]
}
# quantize education
# Within each year, replace the education code by its quintile label ("0"-"4"
# from qcut()). Years in which education is entirely missing are left as is.
for (i in seq_along(templist)) {
  print(i)
  # assign IDs (added even when the year is skipped below)
  wave <- templist[[i]]
  wave$id <- rownames(wave)
  templist[[i]] <- wave
  if (all(is.na(wave$education))) {
    next
  }
  # give everyone an education quantile. ties are broken by random assignment
  # through jitter, which draws from the RNG. jitter shouldn't matter since it
  # will never separate top and bottom quintiles, which is what we care about
  edu <- wave[, c("id", "education")]
  edu$education <- jitter(as.numeric(edu$education), factor=10e-8)
  eduqnt <- as.character(qcut(edu$education, 5)) # just to get labels
  # sanity check: the top-labelled bin must have a higher mean than the
  # bottom-labelled bin
  top_bin <- which(eduqnt == max(as.numeric(eduqnt), na.rm = TRUE))
  bottom_bin <- which(eduqnt == min(as.numeric(eduqnt), na.rm = TRUE))
  stopifnot(mean(edu$education[top_bin]) > mean(edu$education[bottom_bin]))
  edu$education <- eduqnt
  # swap the raw code for the label: merge yields education.x (raw) and
  # education.y (label); drop the former and rename the latter
  wave <- merge(wave, edu, by = "id", all.x = TRUE, all.y = TRUE)
  wave$education.x <- NULL
  stopifnot(colnames(wave)[ncol(wave)] == "education.y")
  colnames(wave)[ncol(wave)] <- "education"
  templist[[i]] <- wave
}
# stack the years back together and drop the helper key
sw <- do.call(rbind, templist)
sw$id <- NULL; row.names(sw) <- NULL
# fill in variables
sw$survey <- rep("SNES", nrow(sw))
sw$wealth <- rep(NA,nrow(sw))
# recode policy questions: Swedish response labels become "0"-"4", the codes
# "8" and "8888" become missing, and the result is made numeric
policy_items <- c("public_sector", "defense_spending", 
                  "health_private", "pornography", "working_day", 
                  "reduce_inequality", "ban_driving", "fewer_refugees", 
                  "nato")
iss <- which(names(sw) %in% policy_items)
response_labels <- c("Mycket bra förslag", "Ganska bra förslag",
                     "Varken bra eller dåligt förslag",
                     "Ganska dåligt förslag", "Mycket dåligt förslag")
response_codes <- c("4", "3", "2", "1", "0")
for (i in iss) {
  answers <- sw[[i]]
  hit <- match(answers, response_labels)
  answers[!is.na(hit)] <- response_codes[hit[!is.na(hit)]]
  answers[answers %in% c("8", "8888")] <- NA
  sw[[i]] <- as.numeric(answers)
}
sw$ideology <- as.numeric(sw$ideology)
# make "0" the "left" position
for (item in c("public_sector", "defense_spending", "working_day",
               "reduce_inequality", "ban_driving")) {
  sw[[item]] <- -1 * (sw[[item]] - 4)
}
# save
final_cols <- c("country","year","survey","ideology","public_sector", 
                "defense_spending", "health_private", "pornography", "working_day", 
                "reduce_inequality", "ban_driving", "fewer_refugees", "nato",
                "wealth", "occupation","education","income","knowledge")
sw <- sw[order(sw$country, sw$year), final_cols]
write.csv(sw, "./final/final-sweden-mass.csv", row.names = F)

#### finally, let's output dyads
# Pair every mass respondent with every elite respondent of the same
# country-year. Mass columns get the prefix "m_", elite columns "e_".
sw$mass_id <- seq(1,nrow(sw),1)
swel$elite_id <- seq(1,nrow(swel),1)
# empty template that fixes the column names of the result
dyads <- data.frame(matrix(ncol=(ncol(sw)+ncol(swel)),nrow=0))
colnames(dyads) <- c(paste("m",colnames(sw),sep="_"),
                     paste("e",colnames(swel),sep="_"))
colnames(sw) <- paste("m", colnames(sw), sep = "_")
colnames(swel) <- paste("e", colnames(swel), sep = "_")
# Only rows of df.final whose country and year both occur in the mass data can
# produce dyads, so restrict the loop to those instead of visiting every
# country-year of the panel. Row order (and any duplicate rows) of df.final is
# preserved.
candidates <- which(df.final$country %in% unique(sw$m_country) &
                      df.final$year %in% unique(sw$m_year))
# create all dyads within matching country-years; the pieces are collected in
# a list and bound once, rather than growing `dyads` with rbind() in the loop
pieces <- vector("list", length(candidates))
for (n in seq_along(candidates)) {
  i <- candidates[n]
  print(i)
  mdata <- sw[which(sw$m_country == df.final$country[i] &
                      sw$m_year == df.final$year[i]), ]
  edata <- swel[which(swel$e_country == df.final$country[i] &
                        swel$e_year == df.final$year[i]), ]
  if (nrow(edata) == 0 || nrow(mdata) == 0) {
    next
  }
  pieces[[n]] <- expand.grid.df(mdata, edata)
}
dyads <- do.call(rbind, c(list(dyads), Filter(Negate(is.null), pieces)))
# create some DVs we care about - divide through so it's on the same 0-1 scale
# emd_<item> = |mass - elite| / divisor, with the divisor listed per item.
# NOTE(review): the divisors (2 for ideology, 5 for the policy items) are kept
# as given -- confirm they match the response ranges of these variables.
emd_divisor <- c(ideology = 2, public_sector = 5, defense_spending = 5,
                 health_private = 5, pornography = 5, working_day = 5,
                 reduce_inequality = 5, ban_driving = 5, fewer_refugees = 5,
                 nato = 5)
for (item in names(emd_divisor)) {
  gap <- abs(dyads[[paste0("m_", item)]] - dyads[[paste0("e_", item)]])
  dyads[[paste0("emd_", item)]] <- gap / emd_divisor[[item]]
}
# export
write.csv(dyads, "./final/sweden-dyads.csv",row.names=F)
# clear the workspace except for the shared helpers and the base panel
keep_objects <- c("completeFun","expand.grid.df",
                  "qcut","rescalr","simpleCap","df.final")
rm(list = setdiff(ls(), keep_objects))

##### VERSION CONTROL #####
# Print the R version, platform, locale, and the attached/loaded package
# versions of the current session. The commented lines that follow are a
# recorded copy of this output, kept for reference.
sessionInfo()
# R version 3.5.1 (2018-07-02)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS High Sierra 10.13.6
# 
# Matrix products: default
# BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
# LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
# 
# locale:
# [1] en_US/en_US/en_US/C/en_US/en_US.UTF-8
# 
# attached base packages:
# [1] grid      stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
# [1] nloptr_1.2.1      readstata13_0.9.2 readxl_1.3.1      arm_1.10-1       
# [5] lme4_1.1-21       Matrix_1.2-17     gridExtra_2.3     ggplot2_3.1.1    
# [9] MASS_7.3-51.4     reshape2_1.4.3    haven_2.1.0       repmis_0.5       
# [13] FactoMineR_1.41   plyr_1.8.4        emdist_0.3-2      stringr_1.4.0    
# [17] WDI_2.6.0         foreign_0.8-71   
# 
# loaded via a namespace (and not attached):
# [1] tidyselect_0.2.5     purrr_0.3.2          splines_3.5.1       
# [4] lattice_0.20-38      colorspace_1.4-1     rlang_0.3.4         
# [7] R.oo_1.22.0          pillar_1.4.1         glue_1.3.1          
# [10] withr_2.1.2          R.utils_2.8.0        R.cache_0.13.0      
# [13] cellranger_1.1.0     munsell_0.5.0        gtable_0.3.0        
# [16] R.methodsS3_1.7.1    leaps_3.0            coda_0.19-2         
# [19] forcats_0.4.0        curl_3.3             Rcpp_1.0.1          
# [22] readr_1.3.1          scales_1.0.0         flashClust_1.01-2   
# [25] scatterplot3d_0.3-41 abind_1.4-5          hms_0.4.2           
# [28] digest_0.6.19        stringi_1.4.3        dplyr_0.8.1         
# [31] RJSONIO_1.3-1.2      cli_1.1.0            tools_3.5.1         
# [34] magrittr_1.5         lazyeval_0.2.2       tibble_2.1.2        
# [37] cluster_2.0.9        crayon_1.3.4         pkgconfig_2.0.2     
# [40] ellipsis_0.1.0       data.table_1.12.2    assertthat_0.2.1    
# [43] minqa_1.2.4          httr_1.4.0           rstudioapi_0.10     
# [46] R6_2.4.0             boot_1.3-22          nlme_3.1-140        
# [49] compiler_3.5.1